|
| 1 | +From 0cd2add6c46400b808329442f81451b369863983 Mon Sep 17 00:00:00 2001 |
| 2 | +From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> |
| 3 | +Date: Sat, 26 Aug 2023 15:08:59 +0200 |
| 4 | +Subject: [PATCH 1/6] Expose line and column information for use in PHP |
| 5 | + |
| 6 | +--- |
| 7 | + source/lexbor/dom/interfaces/node.h | 2 ++ |
| 8 | + source/lexbor/html/token.h | 2 ++ |
| 9 | + source/lexbor/html/tokenizer.c | 24 +++++++++++++++++++++++- |
| 10 | + source/lexbor/html/tokenizer.h | 2 ++ |
| 11 | + source/lexbor/html/tokenizer/state.h | 2 ++ |
| 12 | + source/lexbor/html/tree.c | 11 +++++++++++ |
| 13 | + source/lexbor/html/tree/error.c | 5 +++-- |
| 14 | + source/lexbor/html/tree/error.h | 5 +++-- |
| 15 | + 8 files changed, 48 insertions(+), 5 deletions(-) |
| 16 | + |
| 17 | +diff --git a/source/lexbor/dom/interfaces/node.h b/source/lexbor/dom/interfaces/node.h |
| 18 | +index 6c74ac5..b95373c 100644 |
| 19 | +--- a/source/lexbor/dom/interfaces/node.h |
| 20 | ++++ b/source/lexbor/dom/interfaces/node.h |
| 21 | +@@ -86,6 +86,8 @@ struct lxb_dom_node { |
| 22 | + |
| 23 | + lxb_dom_node_type_t type; |
| 24 | + |
| 25 | ++ size_t line; |
| 26 | ++ |
| 27 | + #ifdef LXB_DOM_NODE_USER_VARIABLES |
| 28 | + LXB_DOM_NODE_USER_VARIABLES |
| 29 | + #endif /* LXB_DOM_NODE_USER_VARIABLES */ |
| 30 | +diff --git a/source/lexbor/html/token.h b/source/lexbor/html/token.h |
| 31 | +index 79accd0..0b7f4fd 100644 |
| 32 | +--- a/source/lexbor/html/token.h |
| 33 | ++++ b/source/lexbor/html/token.h |
| 34 | +@@ -33,6 +33,8 @@ enum lxb_html_token_type { |
| 35 | + typedef struct { |
| 36 | + const lxb_char_t *begin; |
| 37 | + const lxb_char_t *end; |
| 38 | ++ size_t line; |
| 39 | ++ size_t column; |
| 40 | + |
| 41 | + const lxb_char_t *text_start; |
| 42 | + const lxb_char_t *text_end; |
| 43 | +diff --git a/source/lexbor/html/tokenizer.c b/source/lexbor/html/tokenizer.c |
| 44 | +index 22b88ed..1d9f378 100644 |
| 45 | +--- a/source/lexbor/html/tokenizer.c |
| 46 | ++++ b/source/lexbor/html/tokenizer.c |
| 47 | +@@ -92,6 +92,7 @@ lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz) |
| 48 | + |
| 49 | + tkz->pos = tkz->start; |
| 50 | + tkz->end = tkz->start + LXB_HTML_TKZ_TEMP_SIZE; |
| 51 | ++ /* current_line & current_column already initialized by calloc (zero-based) */ |
| 52 | + |
| 53 | + tkz->tree = NULL; |
| 54 | + tkz->tags = NULL; |
| 55 | +@@ -153,6 +154,8 @@ lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to, |
| 56 | + tkz_to->start = tkz_from->start; |
| 57 | + tkz_to->end = tkz_from->end; |
| 58 | + tkz_to->pos = tkz_to->start; |
| 59 | ++ tkz_to->current_line = tkz_from->current_line; |
| 60 | ++ tkz_to->current_column = tkz_from->current_column; |
| 61 | + |
| 62 | + return LXB_STATUS_OK; |
| 63 | + } |
| 64 | +@@ -571,7 +574,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, |
| 65 | + tkz->last = end; |
| 66 | + |
| 67 | + while (data < end) { |
| 68 | +- data = tkz->state(tkz, data, end); |
| 69 | ++ size_t current_column = tkz->current_column; |
| 70 | ++ const lxb_char_t *new_data = tkz->state(tkz, data, end); |
| 71 | ++ while (data < new_data) { |
| 72 | + /* In UTF-8, code points < 0x80 are encoded as their ASCII byte and every byte of a multi-byte sequence is >= 0x80, so a '\n' byte always identifies a newline. */ |
| 73 | ++ if (*data == '\n') { |
| 74 | ++ tkz->current_line++; |
| 75 | ++ current_column = 0; |
| 76 | ++ } else { |
| 77 | + /* Other characters can be mapped back to the Unicode code point offset because UTF-8 is a prefix code. |
| 78 | + * Continuation bytes have the form 0b10XXXXXX, so we can skip those and count only the first byte of each encoded code point. */ |
| 79 | ++ if ((*data & 0b11000000) == 0b10000000) { |
| 80 | ++ /* Continuation byte, do nothing */ |
| 81 | ++ } else { |
| 82 | ++ /* First byte for a codepoint */ |
| 83 | ++ current_column++; |
| 84 | ++ } |
| 85 | ++ } |
| 86 | ++ data++; |
| 87 | ++ } |
| 88 | ++ tkz->current_column = current_column; |
| 89 | + } |
| 90 | + |
| 91 | + return tkz->status; |
| 92 | +diff --git a/source/lexbor/html/tokenizer.h b/source/lexbor/html/tokenizer.h |
| 93 | +index 12b7c81..aa1ac37 100644 |
| 94 | +--- a/source/lexbor/html/tokenizer.h |
| 95 | ++++ b/source/lexbor/html/tokenizer.h |
| 96 | +@@ -79,6 +79,8 @@ struct lxb_html_tokenizer { |
| 97 | + const lxb_char_t *end; |
| 98 | + const lxb_char_t *begin; |
| 99 | + const lxb_char_t *last; |
| 100 | ++ size_t current_line; |
| 101 | ++ size_t current_column; |
| 102 | + |
| 103 | + /* Entities */ |
| 104 | + const lexbor_sbst_entry_static_t *entity; |
| 105 | +diff --git a/source/lexbor/html/tokenizer/state.h b/source/lexbor/html/tokenizer/state.h |
| 106 | +index 5e91444..52eaa9a 100644 |
| 107 | +--- a/source/lexbor/html/tokenizer/state.h |
| 108 | ++++ b/source/lexbor/html/tokenizer/state.h |
| 109 | +@@ -90,6 +90,8 @@ extern "C" { |
| 110 | + do { \ |
| 111 | + tkz->pos = tkz->start; \ |
| 112 | + tkz->token->begin = v_begin; \ |
| 113 | ++ tkz->token->line = tkz->current_line; \ |
| 114 | ++ tkz->token->column = tkz->current_column; \ |
| 115 | + } \ |
| 116 | + while (0) |
| 117 | + |
| 118 | +diff --git a/source/lexbor/html/tree.c b/source/lexbor/html/tree.c |
| 119 | +index 062ea56..3f4c18d 100644 |
| 120 | +--- a/source/lexbor/html/tree.c |
| 121 | ++++ b/source/lexbor/html/tree.c |
| 122 | +@@ -431,6 +431,9 @@ lxb_html_tree_create_element_for_token(lxb_html_tree_t *tree, |
| 123 | + return NULL; |
| 124 | + } |
| 125 | + |
| 126 | ++ node->line = token->line; |
| 127 | + /* We only expose the line number in PHP DOM, so the column is not copied to the node. */ |
| 128 | ++ |
| 129 | + lxb_status_t status; |
| 130 | + lxb_dom_element_t *element = lxb_dom_interface_element(node); |
| 131 | + |
| 132 | +@@ -767,6 +770,11 @@ lxb_html_tree_insert_character_for_data(lxb_html_tree_t *tree, |
| 133 | + |
| 134 | + lxb_dom_interface_text(text)->char_data.data = *str; |
| 135 | + |
| 136 | ++ if (tree->tkz_ref) { |
| 137 | ++ text->line = tree->tkz_ref->token->line; |
| 138 | + /* We only expose the line number in PHP DOM, so the column is not copied to the node. */ |
| 139 | ++ } |
| 140 | ++ |
| 141 | + if (ret_node != NULL) { |
| 142 | + *ret_node = text; |
| 143 | + } |
| 144 | +@@ -806,6 +814,9 @@ lxb_html_tree_insert_comment(lxb_html_tree_t *tree, |
| 145 | + return NULL; |
| 146 | + } |
| 147 | + |
| 148 | ++ node->line = token->line; |
| 149 | + /* We only expose the line number in PHP DOM, so the column is not copied to the node. */ |
| 150 | ++ |
| 151 | + tree->status = lxb_html_token_make_text(token, &comment->char_data.data, |
| 152 | + tree->document->dom_document.text); |
| 153 | + if (tree->status != LXB_STATUS_OK) { |
| 154 | +diff --git a/source/lexbor/html/tree/error.c b/source/lexbor/html/tree/error.c |
| 155 | +index ffdc55c..ef36eab 100644 |
| 156 | +--- a/source/lexbor/html/tree/error.c |
| 157 | ++++ b/source/lexbor/html/tree/error.c |
| 158 | +@@ -22,8 +22,9 @@ lxb_html_tree_error_add(lexbor_array_obj_t *parse_errors, |
| 159 | + } |
| 160 | + |
| 161 | + entry->id = id; |
| 162 | +- entry->begin = token->begin; |
| 163 | +- entry->end = token->end; |
| 164 | ++ entry->line = token->line; |
| 165 | ++ entry->column = token->column; |
| 166 | ++ entry->length = token->end - token->begin; |
| 167 | + |
| 168 | + return entry; |
| 169 | + } |
| 170 | +diff --git a/source/lexbor/html/tree/error.h b/source/lexbor/html/tree/error.h |
| 171 | +index 7a212af..b186772 100644 |
| 172 | +--- a/source/lexbor/html/tree/error.h |
| 173 | ++++ b/source/lexbor/html/tree/error.h |
| 174 | +@@ -109,8 +109,9 @@ lxb_html_tree_error_id_t; |
| 175 | + |
| 176 | + typedef struct { |
| 177 | + lxb_html_tree_error_id_t id; |
| 178 | +- const lxb_char_t *begin; |
| 179 | +- const lxb_char_t *end; |
| 180 | ++ size_t line; |
| 181 | ++ size_t column; |
| 182 | ++ size_t length; |
| 183 | + } |
| 184 | + lxb_html_tree_error_t; |
| 185 | + |
| 186 | +-- |
| 187 | +2.51.2 |
| 188 | + |
0 commit comments