Skip to content

leading and trailing whitespace is lost when it should be part of text nodes #87

@milahu

Description

@milahu

whitespace is significant for lossless AST transformers

when the parser does not handle this,
then I need extra code in the semantic stage to look ahead to the next node

similar #40

In the specific case of HTML parsing, I couldn't tell if or when it's reasonable to treat leading and trailing whitespace as significant.

test.html


<div>

  aaa

  bbb

</div>

the right source column shows the lookbehind source (the text between the end of the previous node and the start of this node) plus node.text,
which is easy to compute using node.range.end_byte of the previous node

lookahead would be more complex...

tree-sitter-html

node  5 = <               : "<"                            : "\n<"
node 17 = tag_name        : "div"                          : "div"
node  3 = >               : ">"                            : ">"
node 16 = text            : "aaa\n\n  bbb"                 : "\n\n  aaa\n\n  bbb"
node  7 = </              : "</"                           : "\n\n</"
node 17 = tag_name        : "div"                          : "div"
node  3 = >               : ">"                            : ">"

lezer-parser-html

note how both source columns are identical
so this is a truly "lossless" parser (CST parser)

node 16 = Text            : "\n"                           : "\n"
node  6 = StartTag        : "<"                            : "<"
node 22 = TagName         : "div"                          : "div"
node  4 = EndTag          : ">"                            : ">"
node 16 = Text            : "\n\n  aaa\n\n  bbb\n\n"       : "\n\n  aaa\n\n  bbb\n\n"
node 11 = StartCloseTag   : "</"                           : "</"
node 22 = TagName         : "div"                          : "div"
node  4 = EndTag          : ">"                            : ">"
node 16 = Text            : "\n"                           : "\n"

diff

+ node 16 = Text            : "\n"                           : "\n"
- node 16 = text            : "aaa\n\n  bbb"                 : "\n\n  aaa\n\n  bbb"
+ node 16 = Text            : "\n\n  aaa\n\n  bbb\n\n"       : "\n\n  aaa\n\n  bbb\n\n"
+ node 16 = Text            : "\n"                           : "\n"
repro.py
#!/usr/bin/env python3

# pip install tree-sitter tree-sitter-languages

import json
import tree_sitter
import tree_sitter_languages

# Sample document fed to the parser. The blank lines and the indentation
# around "aaa" / "bbb" are the whitespace this script reports on.
input_html_bytes = b"""
<div>

  aaa

  bbb

</div>
"""

def walk_html_tree(tree, func):
    """Visit every node reachable from ``tree`` in pre-order.

    ``tree`` only needs a ``walk()`` method returning a tree cursor.
    ``func(node, is_compound)`` is called once per node; ``is_compound`` is
    True when the node's ``kind_id`` is one of the container kinds below.
    """
    # Container kinds: these are ignored when serializing the tree.
    # Token kinds such as '<!', '>', the quote characters and attribute_value
    # are deliberately not listed here.
    compound_kind_id = frozenset((
        25,  # fragment
        26,  # doctype
        28,  # element
        29,  # script_element
        30,  # style_element
        31,  # start_tag
        34,  # self_closing_tag
        35,  # end_tag
        37,  # attribute
        38,  # quoted_attribute_value
    ))
    cursor = tree.walk()
    finished = False
    while not finished:
        current = cursor.node
        func(current, current.kind_id in compound_kind_id)
        # Descend first, then move sideways.
        if cursor.goto_first_child() or cursor.goto_next_sibling():
            continue
        # Dead end: climb until some ancestor has a next sibling, or until
        # the cursor cannot climb any further (traversal is then complete).
        while True:
            climbed = cursor.goto_parent()
            if not climbed:
                finished = True
            moved_sideways = cursor.goto_next_sibling()
            if moved_sideways or not climbed:
                break

# Output options.
max_len = 30                 # longest JSON-encoded text printed before truncation
show_compound_nodes = False  # when True, container nodes are printed as well

# Running state updated by walk_callback_test.
node_idx = -1     # incremented once per callback invocation
last_node_to = 0  # end byte of the most recently printed non-compound node

def walk_callback_test(node, is_compound):
    """Print one table row for ``node`` and advance the module-level counters.

    Each row shows the node's kind id, its type, its own text, and the source
    from the end of the previously printed non-compound node up to the end of
    this node. Both texts are JSON-encoded and cut to ``max_len`` characters
    (followed by "...") when longer.

    Non-compound nodes are always printed and move ``last_node_to`` forward.
    Compound nodes are printed (prefixed with "# ") only when
    ``show_compound_nodes`` is set, and never move ``last_node_to``.
    """
    global node_idx
    global last_node_to

    def shorten(text):
        return text[0:max_len] + "..." if len(text) > max_len else text

    node_text = shorten(json.dumps(node.text.decode("utf8")))
    # Computed before branching: the compound branch prints it too. Previously
    # it was only assigned in the non-compound branch, so enabling
    # show_compound_nodes raised UnboundLocalError on the first compound node.
    space_node_text = shorten(
        json.dumps(input_html_bytes[last_node_to:node.range.end_byte].decode("utf8"))
    )

    if not is_compound:
        line_prefix = "  " if show_compound_nodes else ""
        print(line_prefix + f"node {node.kind_id:2d} = {node.type:15s} : {node_text:30s} : {space_node_text}")
        last_node_to = node.range.end_byte
    elif show_compound_nodes:
        line_prefix = "# "
        print(line_prefix + f"node {node.kind_id:2d} = {node.type:15s} : {node_text:30s} : {space_node_text}")

    node_idx += 1
    #if node_idx > 20: raise "todo"

# Obtain an HTML parser from the tree_sitter_languages package.
tree_sitter_html = tree_sitter_languages.get_parser("html")

# Parse the sample document defined above.
html_tree = tree_sitter_html.parse(input_html_bytes)

# Traverse from the root node, invoking walk_callback_test for every node.
walk_html_tree(html_tree.root_node, walk_callback_test)
repro.js
#!/usr/bin/env node

/*
npm init -y
npm install @lezer/html
*/

import { parser as lezerParserHtml } from '@lezer/html';

// Sample document; the blank lines and the indentation around "aaa" / "bbb"
// are the whitespace this script reports on.
const inputHtml = `
<div>

  aaa

  bbb

</div>
`;

// Parser instance configured to fail loudly instead of producing error nodes.
const htmlParser = lezerParserHtml.configure({
    strict: true, // throw on parse error
    //dialect: "selfClosing",
});

// Parse the sample document.
const htmlTree = htmlParser.parse(inputHtml);

// Top node of the parsed tree; the traversal below starts here.
const rootNode = htmlTree.topNode;

// based on nix-eval-js/src/lezer-parser-nix/src/nix-format.js
/**
 * Walk `tree` in pre-order (node, then children, then siblings) and call
 * `func` with the cursor for every node whose type is not a container type.
 *
 * When `tree` is a node inside a larger tree, the walk stops at the end of
 * that node: its siblings and ancestors are not visited.
 *
 * NOTE InvalidEntity breaks the parser, e.g.
 *   <a t="a&amp;b&c">a&amp;b&c</a>
 * -> require valid input, throw on parse error
 *
 * @param {Tree | TreeNode} tree - anything exposing `cursor()`
 * @param {(cursor: TreeCursor) => void} func - called with the live cursor;
 *   read what you need inside the callback, since the cursor moves on afterwards
 * @returns {void}
 */
function walkHtmlTree(tree, func) {
    // Node types that are never passed to `func`.
    const skippedTypeIds = new Set([
        15, // Document
        20, // Element
        23, // Attribute
        21, // OpenTag <script>
        30, // OpenTag <style>
        36, // OpenTag
        32, // CloseTag </style>
        29, // CloseTag </script>
        37, // CloseTag
        38, // SelfClosingTag
        // note: this is inconsistent in the parser
        // InvalidEntity is child node
        // EntityReference is separate node (sibling of other text nodes)
        19, // InvalidEntity: <a href="?a=1&b=2" -> "&" is parsed as InvalidEntity
        //17, // EntityReference: "&amp;" or "&mdash;" is parsed as EntityReference
    ]);
    const cursor = tree.cursor();
    if (!cursor) return;
    // Depth relative to the starting node; used to stay inside its subtree.
    let depth = 0;
    while (true) {
        // Node
        if (!skippedTypeIds.has(cursor.type.id)) {
            func(cursor);
        }
        // Left
        if (cursor.firstChild()) {
            depth++;
            continue;
        }
        // Right (never for the starting node itself)
        if (depth > 0 && cursor.nextSibling()) {
            continue;
        }
        // Up: climb until an ancestor has a next sibling.
        let movedRight = false;
        while (cursor.parent()) {
            depth--;
            if (depth <= 0) {
                // back at (or above) the starting node: do not visit its
                // siblings or parents
                return;
            }
            if (cursor.nextSibling()) {
                movedRight = true;
                break;
            }
        }
        if (!movedRight) return;
    }
}

// End offset of the most recently reported node.
let lastNodeTo = 0;
// Longest JSON-encoded source shown per column.
const maxLen = 30;
// Cut a string down to at most maxLen characters.
const clip = (text) => (text.length > maxLen ? text.slice(0, maxLen) : text);

// Print one row per visited node: type id, type name, the node's own source,
// and the source from the end of the previous node up to the end of this one.
walkHtmlTree(rootNode, (node) => {
    const { from, to, type } = node;
    const ownSource = clip(JSON.stringify(inputHtml.slice(from, to)));
    const sourceSinceLastNode = clip(JSON.stringify(inputHtml.slice(lastNodeTo, to)));
    const idColumn = String(type.id).padStart(2);
    const nameColumn = type.name.padEnd(15);
    console.log(`node ${idColumn} = ${nameColumn} : ${ownSource.padEnd(maxLen)} : ${sourceSinceLastNode}`);
    lastNodeTo = to;
});

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions