Skip to content

doctype: parse all child nodes #83

@milahu

Description

@milahu

input

<!doctype html><hr>

result: compound nodes are prefixed with #

# node 25 = fragment: '<!doctype html><hr>'
# node 26 = doctype: '<!doctype html>'
node 1 = <!: '<!' -> '<!'
node 4 = doctype: 'doctype' -> 'doctype'
node 3 = >: '>' -> ' html>'
# node 28 = element: '<hr>'
# node 31 = start_tag: '<hr>'
node 5 = <: '<' -> '<'
node 17 = tag_name: 'hr' -> 'hr'
node 3 = >: '>' -> '>'

problem: the 'html' in '<!doctype html>' has no parse node
and the close tag '>' of '<!doctype html>'
has the same node type as the close tag '>' of '<hr>'

note how ' html' spills into the text printed for '>'
when each leaf's source is taken as node_source = input_html[last_node_to:node.range.end_byte]

node 1 = <!: '<!' -> '<!'
node 4 = doctype: 'doctype' -> 'doctype'
node 3 = >: '>' -> ' html>'

this causes problems in a semantic stage that uses this parser,
where I want to ...

either ignore the compound node '<!doctype html>'
and process its child nodes '<!' and 'doctype' and 'html' and '>'

or process the compound node and ignore its child nodes

the cheap solution would be
to use a different node type for '>' of '<!doctype html>'

Details
# https://github.com/tree-sitter/py-tree-sitter/issues/33
#def traverse_tree(tree: Tree):
def walk_html_tree(tree, func):
    """Visit every node reachable from ``tree.walk()`` in document order.

    ``func(node, is_compound)`` is called once per node, a parent always
    before its children. ``is_compound`` is True when the node's ``kind_id``
    is one of the container kinds listed below.

    ``tree`` only needs a ``walk()`` method returning a cursor that offers
    ``node``, ``goto_first_child()``, ``goto_next_sibling()`` and
    ``goto_parent()``.
    """
    # Container ("compound") node kinds; these are ignored when serializing
    # the tree. Deliberately NOT listed: '<!' (1), '>' (3),
    # attribute_value (10), single quote (12), double quote (14).
    container_kinds = frozenset((
        25,  # fragment
        26,  # doctype
        28,  # element
        29,  # script_element
        30,  # style_element
        31,  # start_tag
        34,  # self_closing_tag
        35,  # end_tag
        37,  # attribute
        38,  # quoted_attribute_value
    ))

    cur = tree.walk()
    while True:
        current = cur.node
        func(current, current.kind_id in container_kinds)

        # Prefer going deeper; otherwise step sideways.
        if cur.goto_first_child() or cur.goto_next_sibling():
            continue

        # Dead end: climb until an ancestor has a next sibling to continue
        # from, or until the cursor cannot climb any further (traversal done).
        while True:
            climbed = cur.goto_parent()
            advanced = cur.goto_next_sibling()
            if not climbed:
                return
            if advanced:
                break

# End byte of the most recently printed non-compound node; walk_callback
# reads and updates it to slice the source between consecutive leaves.
last_node_to = 0

# Must be bytes, not str: walk_callback slices it by node byte offsets and
# then calls .decode("utf8") on the slice, and it is handed to the parser.
input_html = b"""<!doctype html><hr>"""

def walk_callback(node, is_compound):
    """Print one line describing ``node``; used as the ``walk_html_tree`` callback.

    Compound nodes are printed with a leading ``#`` and only their own text.
    For every other node the line also shows the slice of ``input_html``
    running from the end of the previously printed non-compound node up to
    this node's end byte, so any source text lying between two leaves is
    shown attached to the later one.

    Reads the module-level ``input_html`` (the slice is decoded, so it is
    expected to be bytes) and updates the module-level ``last_node_to``.
    """
    # This function is defined at module scope, so the running offset has to
    # be declared ``global``; ``nonlocal`` is only valid in a nested function.
    global last_node_to

    # repr() of the node's own text, shortened for display.
    s = repr(node.text.decode("utf8"))
    if len(s) > 50:
        s = s[0:50] + "..."

    if not is_compound:
        # Everything since the previous leaf ended, up to this leaf's end.
        node_source = input_html[last_node_to:node.range.end_byte]
        last_node_to = node.range.end_byte
        node_source = node_source.decode("utf8")
        if len(node_source) > 50:
            node_source = node_source[0:50] + "..."
        print(f"node {node.kind_id} = {node.type}: {s} -> {repr(node_source)}")
    else:
        print(f"# node {node.kind_id} = {node.type}: {s}")

import tree_sitter
import tree_sitter_languages

# One HTML parser object, reachable under both names.
html_parser = tree_sitter_html = tree_sitter_languages.get_parser("html")

# Parse the sample document and keep both the tree and its root node.
html_tree = html_parser.parse(input_html)
top_node = html_tree.root_node

# walk_html_tree calls .walk() on whatever it is given; here that is the
# root node, and walk_callback prints one line per visited node.
walk_html_tree(top_node, walk_callback)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions