Whitespace is significant for lossless AST transformers.
When the parser does not preserve it,
I need extra code in the semantic stage to look ahead to the next node.
similar #40
In the specific case of HTML parsing, I couldn't tell if or when it's reasonable to treat leading and trailing whitespace as significant.
test.html
The right-hand source column shows the lookbehind source (everything since the end of the previous node) plus node.text,
which is easy to compute from node.range.end_byte of the previous node.
Lookahead would be more complex...
tree-sitter-html
node 5 = < : "<" : "\n<"
node 17 = tag_name : "div" : "div"
node 3 = > : ">" : ">"
node 16 = text : "aaa\n\n bbb" : "\n\n aaa\n\n bbb"
node 7 = </ : "</" : "\n\n</"
node 17 = tag_name : "div" : "div"
node 3 = > : ">" : ">"
lezer-parser-html
Note how both source columns are identical,
so this is a truly "lossless" parser (a CST parser).
node 16 = Text : "\n" : "\n"
node 6 = StartTag : "<" : "<"
node 22 = TagName : "div" : "div"
node 4 = EndTag : ">" : ">"
node 16 = Text : "\n\n aaa\n\n bbb\n\n" : "\n\n aaa\n\n bbb\n\n"
node 11 = StartCloseTag : "</" : "</"
node 22 = TagName : "div" : "div"
node 4 = EndTag : ">" : ">"
node 16 = Text : "\n" : "\n"
diff
+ node 16 = Text : "\n" : "\n"
- node 16 = text : "aaa\n\n bbb" : "\n\n aaa\n\n bbb"
+ node 16 = Text : "\n\n aaa\n\n bbb\n\n" : "\n\n aaa\n\n bbb\n\n"
+ node 16 = Text : "\n" : "\n"
repro.py
#!/usr/bin/env python3
# pip install tree-sitter tree-sitter-languages
import json
import tree_sitter
import tree_sitter_languages
# Sample document fed to the parser.
# Written with explicit escapes so the blank lines and the leading space
# before "aaa"/"bbb" cannot be lost: the documented output shows the text
# between the tags as "\n\n aaa\n\n bbb\n\n".
# NOTE(review): the documented output shows one space of indentation;
# the original file may have used more - confirm.
input_html_bytes = (
    b"\n"
    b"<div>\n"
    b"\n"
    b" aaa\n"
    b"\n"
    b" bbb\n"
    b"\n"
    b"</div>\n"
)
def walk_html_tree(tree, func):
    """Call ``func`` for every node reachable from ``tree``, parents first.

    Args:
        tree: Object whose ``walk()`` returns a cursor exposing ``node``,
            ``goto_first_child()``, ``goto_next_sibling()`` and
            ``goto_parent()``.
        func: Callback invoked as ``func(node, is_compound)``.
            ``is_compound`` is True when ``node.kind_id`` is one of the
            container kinds listed below.

    Returns:
        None. Results are whatever ``func`` does with each node.
    """
    # compound tags
    # these are ignored when serializing the tree
    # NOTE(review): the ids are hard-coded numbers; presumably they are tied
    # to the version of the HTML grammar in use - confirm when upgrading.
    compound_kind_ids = frozenset((
        25,  # fragment
        26,  # doctype
        28,  # element
        29,  # script_element
        30,  # style_element
        31,  # start_tag
        34,  # self_closing_tag
        35,  # end_tag
        37,  # attribute
        38,  # quoted_attribute_value
    ))
    # Deliberately NOT treated as compound (they were commented out of the
    # list): 1 '<!', 3 '>', 14 double quote, 12 single quote,
    # 10 attribute_value.

    cursor = tree.walk()
    while True:
        node = cursor.node
        func(node, node.kind_id in compound_kind_ids)

        # Descend first, then move sideways.
        if cursor.goto_first_child():
            continue
        if cursor.goto_next_sibling():
            continue

        # No child and no sibling: climb until an ancestor has a next
        # sibling. Running out of parents means the walk is complete.
        while True:
            if not cursor.goto_parent():
                return
            if cursor.goto_next_sibling():
                break
# End offset (in bytes) of the last non-compound node that was printed.
last_node_to = 0
# Number of nodes seen so far, minus one (starts at -1, bumped per call).
node_idx = -1
# Maximum length of each JSON-encoded source column before truncation.
max_len = 30
# When True, compound nodes are printed too, prefixed with "# ".
show_compound_nodes = False


def _json_preview(text):
    """Return ``text`` JSON-encoded, cut to ``max_len`` chars plus "..." if longer."""
    encoded = json.dumps(text)
    if len(encoded) > max_len:
        encoded = encoded[0:max_len] + "..."
    return encoded


def walk_callback_test(node, is_compound):
    """Print one line describing ``node`` and advance the module counters.

    Each printed line has two source columns: the node's own text, and the
    input from the end of the previously printed non-compound node up to the
    end of this node (so skipped whitespace becomes visible).

    Args:
        node: Node exposing ``text`` (bytes), ``kind_id``, ``type`` and
            ``range.end_byte``.
        is_compound: True for container nodes. They are printed only when
            ``show_compound_nodes`` is set, and never move ``last_node_to``.

    Side effects:
        Writes to stdout, always increments ``node_idx``, and updates
        ``last_node_to`` for non-compound nodes.
    """
    global node_idx
    global last_node_to

    node_text = _json_preview(node.text.decode("utf8"))

    if not is_compound or show_compound_nodes:
        # Compute the second column for every printed line. Previously the
        # compound branch used this name without assigning it, which raised
        # UnboundLocalError as soon as show_compound_nodes was enabled.
        space_node_text = _json_preview(
            input_html_bytes[last_node_to:node.range.end_byte].decode("utf8")
        )
        if is_compound:
            line_prefix = "# "
        else:
            line_prefix = " " if show_compound_nodes else ""
        print(line_prefix + f"node {node.kind_id:2d} = {node.type:15s} : {node_text:30s} : {space_node_text}")

    if not is_compound:
        # Only leaf output consumes input; compound nodes overlap their children.
        last_node_to = node.range.end_byte

    node_idx += 1
# Build an HTML parser, parse the sample input, then print one line per node
# starting from the root of the resulting tree.
tree_sitter_html = tree_sitter_languages.get_parser("html")
html_tree = tree_sitter_html.parse(input_html_bytes)
html_root_node = html_tree.root_node
walk_html_tree(html_root_node, walk_callback_test)
repro.js
#!/usr/bin/env node
/*
npm init -y
npm install @lezer/html
*/
import { parser as lezerParserHtml } from '@lezer/html';
// Sample document fed to the parser.
// Built from explicit lines so the blank lines and the leading space before
// "aaa"/"bbb" cannot be lost: the documented output shows the text between
// the tags as "\n\n aaa\n\n bbb\n\n".
// NOTE(review): the documented output shows one space of indentation;
// the original file may have used more - confirm.
const inputHtml = [
  "",
  "<div>",
  "",
  " aaa",
  "",
  " bbb",
  "",
  "</div>",
  "",
].join("\n");
// strict: throw on parse error
// (the "selfClosing" dialect is intentionally left disabled)
const htmlParser = lezerParserHtml.configure({ strict: true });
const htmlTree = htmlParser.parse(inputHtml);
// The walk below starts from the top node of the parsed tree.
const { topNode: rootNode } = htmlTree;
// based on nix-eval-js/src/lezer-parser-nix/src/nix-format.js

// Type ids of container nodes. These are walked into but never passed to the
// callback, so only their leaf tokens are reported.
// NOTE(review): the ids are hard-coded numbers; presumably they are tied to
// the version of the HTML grammar in use - confirm when upgrading.
const walkHtmlTreeSkippedTypeIds = new Set([
  15, // Document
  20, // Element
  23, // Attribute
  21, // OpenTag <script>
  30, // OpenTag <style>
  36, // OpenTag
  32, // CloseTag </style>
  29, // CloseTag </script>
  37, // CloseTag
  38, // SelfClosingTag
  // note: this is inconsistent in the parser
  // InvalidEntity is child node
  // EntityReference is separate node (sibling of other text nodes)
  19, // InvalidEntity: <a href="?a=1&b=2" -> "&" is parsed as InvalidEntity
  // 17 (EntityReference) is deliberately NOT skipped; presumably this covers
  // entities written as "&amp;" or "&mdash;" in the source.
]);

/**
 * Visit every node below `tree` in NLR order (node, then children left to
 * right) and call `func` for each node whose type id is not a container.
 *
 * The walk never leaves `tree`: once it climbs back to the starting depth it
 * stops instead of visiting siblings or parents of the start node.
 *
 * @param {Tree | TreeNode} tree
 * @param {(cursor: TreeCursor) => void} func called with the live cursor;
 *   read what you need before returning, since the cursor moves on afterwards
 * @returns {void}
 */
function walkHtmlTree(tree, func) {
  const cursor = tree.cursor();
  if (!cursor) return;
  let depth = 0;
  mainLoop: while (true) {
    // Node
    // NOTE InvalidEntity breaks the parser
    // <a t="a&b&c">a&b&c</a>
    // -> require valid input, throw on parse error
    if (!walkHtmlTreeSkippedTypeIds.has(cursor.type.id)) {
      func(cursor);
    }
    // Left
    if (cursor.firstChild()) {
      // moved down
      depth++;
      continue;
    }
    // Right
    if (depth > 0 && cursor.nextSibling()) {
      // moved right
      continue;
    }
    while (cursor.parent()) {
      // moved up
      depth--;
      if (depth <= 0) {
        // when tree is a node, stop at the end of node
        // == dont visit sibling or parent nodes
        return;
      }
      if (cursor.nextSibling()) {
        // moved up + right
        continue mainLoop;
      }
    }
    // no parent left: the whole tree has been visited
    return;
  }
}
// End offset of the node printed most recently; the second source column of
// each line starts here, so text skipped between nodes becomes visible.
let lastNodeTo = 0;
// Maximum length of each JSON-encoded source column.
const maxLen = 30;
// Cut a column down to maxLen characters (no ellipsis is added).
const clipToMaxLen = (text) => (text.length > maxLen ? text.slice(0, maxLen) : text);
walkHtmlTree(rootNode, (node) => {
  const { from, to, type } = node;
  const nodeSource = clipToMaxLen(JSON.stringify(inputHtml.slice(from, to)));
  const spaceNodeSource = clipToMaxLen(JSON.stringify(inputHtml.slice(lastNodeTo, to)));
  const idColumn = String(type.id).padStart(2);
  const nameColumn = type.name.padEnd(15);
  console.log(`node ${idColumn} = ${nameColumn} : ${nodeSource.padEnd(maxLen)} : ${spaceNodeSource}`);
  lastNodeTo = to;
});
whitespace is significant for lossless AST transformers
when this is not handled by the parser
then i need extra code in the semantic stage, to lookahead to the next node
similar #40
test.html
the right source column has lookbehind source plus
node.text, which is easy to do with
node.range.end_byte of the previous node. lookahead would be more complex...
tree-sitter-html
lezer-parser-html
note how both source columns are identical
so this is a truly "lossless" parser (CST parser)
diff
repro.py
repro.js