php
diff --git a/ext/dom/lexbor/patches/0001-Expose-line-and-column-information-for-use-in-PHP.patch b/ext/dom/lexbor/patches/0001-Expose-line-and-column-information-for-use-in-PHP.patch
Lines changed: 188 additions & 0 deletions
diff --git a/ext/dom/lexbor/patches/0002-Track-implied-added-nodes-for-options-use-in-PHP.patch b/ext/dom/lexbor/patches/0002-Track-implied-added-nodes-for-options-use-in-PHP.patch
Lines changed: 67 additions & 0 deletions
diff --git a/ext/dom/lexbor/patches/0003-Patch-utilities-and-data-structure-to-be-able-to-gen.patch b/ext/dom/lexbor/patches/0003-Patch-utilities-and-data-structure-to-be-able-to-gen.patch
Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,188 @@
+From 0cd2add6c46400b808329442f81451b369863983 Mon Sep 17 00:00:00 2001
+From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
+Date: Sat, 26 Aug 2023 15:08:59 +0200
+Subject: [PATCH 1/6] Expose line and column information for use in PHP
+
+---
+ source/lexbor/dom/interfaces/node.h  |  2 ++
+ source/lexbor/html/token.h           |  2 ++
+ source/lexbor/html/tokenizer.c       | 24 +++++++++++++++++++++++-
+ source/lexbor/html/tokenizer.h       |  2 ++
+ source/lexbor/html/tokenizer/state.h |  2 ++
+ source/lexbor/html/tree.c            | 11 +++++++++++
+ source/lexbor/html/tree/error.c      |  5 +++--
+ source/lexbor/html/tree/error.h      |  5 +++--
+ 8 files changed, 48 insertions(+), 5 deletions(-)
+
+diff --git a/source/lexbor/dom/interfaces/node.h b/source/lexbor/dom/interfaces/node.h
+index 6c74ac5..b95373c 100644
+--- a/source/lexbor/dom/interfaces/node.h
++++ b/source/lexbor/dom/interfaces/node.h
+@@ -86,6 +86,8 @@ struct lxb_dom_node {
+ 
+     lxb_dom_node_type_t    type;
+ 
++    size_t                 line;
++
+ #ifdef LXB_DOM_NODE_USER_VARIABLES
+     LXB_DOM_NODE_USER_VARIABLES
+ #endif /* LXB_DOM_NODE_USER_VARIABLES */
+diff --git a/source/lexbor/html/token.h b/source/lexbor/html/token.h
+index 79accd0..0b7f4fd 100644
+--- a/source/lexbor/html/token.h
++++ b/source/lexbor/html/token.h
+@@ -33,6 +33,8 @@ enum lxb_html_token_type {
+ typedef struct {
+     const lxb_char_t      *begin;
+     const lxb_char_t      *end;
++    size_t                line;
++    size_t                column;
+ 
+     const lxb_char_t      *text_start;
+     const lxb_char_t      *text_end;
+diff --git a/source/lexbor/html/tokenizer.c b/source/lexbor/html/tokenizer.c
+index 22b88ed..1d9f378 100644
+--- a/source/lexbor/html/tokenizer.c
++++ b/source/lexbor/html/tokenizer.c
+@@ -92,6 +92,7 @@ lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)
+ 
+     tkz->pos = tkz->start;
+     tkz->end = tkz->start + LXB_HTML_TKZ_TEMP_SIZE;
++    /* current_line & current_column already initialized by calloc (zero-based) */
+ 
+     tkz->tree = NULL;
+     tkz->tags = NULL;
+@@ -153,6 +154,8 @@ lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to,
+     tkz_to->start = tkz_from->start;
+     tkz_to->end = tkz_from->end;
+     tkz_to->pos = tkz_to->start;
++    tkz_to->current_line = tkz_from->current_line;
++    tkz_to->current_column = tkz_from->current_column;
+ 
+     return LXB_STATUS_OK;
+ }
+@@ -571,7 +574,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
+     tkz->last = end;
+ 
+     while (data < end) {
+-        data = tkz->state(tkz, data, end);
++        size_t current_column = tkz->current_column;
++        const lxb_char_t *new_data = tkz->state(tkz, data, end);
++        while (data < new_data) {
++            /* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
++            if (*data == '\n') {
++                tkz->current_line++;
++                current_column = 0;
++            } else {
++                /* Other characters can be mapped back to the unicode codepoint offset because UTF-8 is a prefix code.
++                 * Continuation bytes start with 0b10XXXXXX so we can skip those to only get the start of an encoded code point. */
++                if ((*data & 0b11000000) == 0b10000000) {
++                    /* Continuation byte, do nothing */
++                } else {
++                    /* First byte for a codepoint */
++                    current_column++;
++                }
++            }
++            data++;
++        }
++        tkz->current_column = current_column;
+     }
+ 
+     return tkz->status;
+diff --git a/source/lexbor/html/tokenizer.h b/source/lexbor/html/tokenizer.h
+index 12b7c81..aa1ac37 100644
+--- a/source/lexbor/html/tokenizer.h
++++ b/source/lexbor/html/tokenizer.h
+@@ -79,6 +79,8 @@ struct lxb_html_tokenizer {
+     const lxb_char_t                 *end;
+     const lxb_char_t                 *begin;
+     const lxb_char_t                 *last;
++    size_t                           current_line;
++    size_t                           current_column;
+ 
+     /* Entities */
+     const lexbor_sbst_entry_static_t *entity;
+diff --git a/source/lexbor/html/tokenizer/state.h b/source/lexbor/html/tokenizer/state.h
+index 5e91444..52eaa9a 100644
+--- a/source/lexbor/html/tokenizer/state.h
++++ b/source/lexbor/html/tokenizer/state.h
+@@ -90,6 +90,8 @@ extern "C" {
+     do {                                                                       \
+         tkz->pos = tkz->start;                                                 \
+         tkz->token->begin = v_begin;                                           \
++        tkz->token->line = tkz->current_line;                                  \
++        tkz->token->column = tkz->current_column;                              \
+     }                                                                          \
+     while (0)
+ 
+diff --git a/source/lexbor/html/tree.c b/source/lexbor/html/tree.c
+index 062ea56..3f4c18d 100644
+--- a/source/lexbor/html/tree.c
++++ b/source/lexbor/html/tree.c
+@@ -431,6 +431,9 @@ lxb_html_tree_create_element_for_token(lxb_html_tree_t *tree,
+         return NULL;
+     }
+ 
++    node->line = token->line;
++    /* We only expose line number in PHP DOM */
++
+     lxb_status_t status;
+     lxb_dom_element_t *element = lxb_dom_interface_element(node);
+ 
+@@ -767,6 +770,11 @@ lxb_html_tree_insert_character_for_data(lxb_html_tree_t *tree,
+ 
+     lxb_dom_interface_text(text)->char_data.data = *str;
+ 
++    if (tree->tkz_ref) {
++        text->line = tree->tkz_ref->token->line;
++        /* We only expose line number in PHP DOM */
++    }
++
+     if (ret_node != NULL) {
+         *ret_node = text;
+     }
+@@ -806,6 +814,9 @@ lxb_html_tree_insert_comment(lxb_html_tree_t *tree,
+         return NULL;
+     }
+ 
++    node->line = token->line;
++    /* We only expose line number in PHP DOM */
++
+     tree->status = lxb_html_token_make_text(token, &comment->char_data.data,
+                                             tree->document->dom_document.text);
+     if (tree->status != LXB_STATUS_OK) {
+diff --git a/source/lexbor/html/tree/error.c b/source/lexbor/html/tree/error.c
+index ffdc55c..ef36eab 100644
+--- a/source/lexbor/html/tree/error.c
++++ b/source/lexbor/html/tree/error.c
+@@ -22,8 +22,9 @@ lxb_html_tree_error_add(lexbor_array_obj_t *parse_errors,
+     }
+ 
+     entry->id = id;
+-    entry->begin = token->begin;
+-    entry->end = token->end;
++    entry->line = token->line;
++    entry->column = token->column;
++    entry->length = token->end - token->begin;
+ 
+     return entry;
+ }
+diff --git a/source/lexbor/html/tree/error.h b/source/lexbor/html/tree/error.h
+index 7a212af..b186772 100644
+--- a/source/lexbor/html/tree/error.h
++++ b/source/lexbor/html/tree/error.h
+@@ -109,8 +109,9 @@ lxb_html_tree_error_id_t;
+ 
+ typedef struct {
+     lxb_html_tree_error_id_t id;
+-    const lxb_char_t         *begin;
+-    const lxb_char_t         *end;
++    size_t                   line;
++    size_t                   column;
++    size_t                   length;
+ }
+ lxb_html_tree_error_t;
+ 
+-- 
+2.51.2
+
@@ -0,0 +1,67 @@
+From a4c29ba8d1ea1065ce6bd4a34382d53140cf1924 Mon Sep 17 00:00:00 2001
+From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
+Date: Mon, 14 Aug 2023 20:18:51 +0200
+Subject: [PATCH 2/6] Track implied added nodes for options use in PHP
+
+---
+ source/lexbor/html/tree.h                            | 3 +++
+ source/lexbor/html/tree/insertion_mode/after_head.c  | 1 +
+ source/lexbor/html/tree/insertion_mode/before_head.c | 2 ++
+ source/lexbor/html/tree/insertion_mode/before_html.c | 2 ++
+ 4 files changed, 8 insertions(+)
+
+diff --git a/source/lexbor/html/tree.h b/source/lexbor/html/tree.h
+index 4912efb..7b2c620 100644
+--- a/source/lexbor/html/tree.h
++++ b/source/lexbor/html/tree.h
+@@ -55,6 +55,9 @@ struct lxb_html_tree {
+     bool                           foster_parenting;
+     bool                           frameset_ok;
+     bool                           scripting;
++    bool                           has_explicit_html_tag;
++    bool                           has_explicit_head_tag;
++    bool                           has_explicit_body_tag;
+ 
+     lxb_html_tree_insertion_mode_f mode;
+     lxb_html_tree_insertion_mode_f original_mode;
+diff --git a/source/lexbor/html/tree/insertion_mode/after_head.c b/source/lexbor/html/tree/insertion_mode/after_head.c
+index ad551b5..1448654 100644
+--- a/source/lexbor/html/tree/insertion_mode/after_head.c
++++ b/source/lexbor/html/tree/insertion_mode/after_head.c
+@@ -71,6 +71,7 @@ lxb_html_tree_insertion_mode_after_head_open(lxb_html_tree_t *tree,
+                 return lxb_html_tree_process_abort(tree);
+             }
+ 
++            tree->has_explicit_body_tag = true;
+             tree->frameset_ok = false;
+             tree->mode = lxb_html_tree_insertion_mode_in_body;
+ 
+diff --git a/source/lexbor/html/tree/insertion_mode/before_head.c b/source/lexbor/html/tree/insertion_mode/before_head.c
+index 14621f2..cd2ac2a 100644
+--- a/source/lexbor/html/tree/insertion_mode/before_head.c
++++ b/source/lexbor/html/tree/insertion_mode/before_head.c
+@@ -67,6 +67,8 @@ lxb_html_tree_insertion_mode_before_head_open(lxb_html_tree_t *tree,
+                 return lxb_html_tree_process_abort(tree);
+             }
+ 
++            tree->has_explicit_head_tag = true;
++
+             tree->mode = lxb_html_tree_insertion_mode_in_head;
+ 
+             break;
+diff --git a/source/lexbor/html/tree/insertion_mode/before_html.c b/source/lexbor/html/tree/insertion_mode/before_html.c
+index 05fe738..1e09cda 100644
+--- a/source/lexbor/html/tree/insertion_mode/before_html.c
++++ b/source/lexbor/html/tree/insertion_mode/before_html.c
+@@ -78,6 +78,8 @@ lxb_html_tree_insertion_mode_before_html_open(lxb_html_tree_t *tree,
+                 return lxb_html_tree_process_abort(tree);
+             }
+ 
++            tree->has_explicit_html_tag = true;
++
+             tree->mode = lxb_html_tree_insertion_mode_before_head;
+ 
+             break;
+-- 
+2.51.2
+
@@ -0,0 +1,97 @@
+From 46fc776449252e74795569759a19d13857a59069 Mon Sep 17 00:00:00 2001
+From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
+Date: Thu, 24 Aug 2023 22:57:48 +0200
+Subject: [PATCH 3/6] Patch utilities and data structure to be able to generate
+ smaller lookup tables
+
+Changed the generation script to check that everything fits in 32 bits,
+and changed the actual field types to 32 bits. This decreases the size
+of the hash tables.
+---
+ source/lexbor/core/shs.h             |  4 ++--
+ utils/lexbor/encoding/single-byte.py |  4 ++--
+ utils/lexbor/lexbor/LXB.py           | 12 +++++++++---
+ 3 files changed, 13 insertions(+), 7 deletions(-)
+
+diff --git a/source/lexbor/core/shs.h b/source/lexbor/core/shs.h
+index 7a63a07..c84dfaa 100644
+--- a/source/lexbor/core/shs.h
++++ b/source/lexbor/core/shs.h
+@@ -27,9 +27,9 @@ lexbor_shs_entry_t;
+ 
+ typedef struct {
+     uint32_t key;
+-    void     *value;
++    uint32_t value;
+ 
+-    size_t   next;
++    uint32_t next;
+ }
+ lexbor_shs_hash_t;
+ 
+diff --git a/utils/lexbor/encoding/single-byte.py b/utils/lexbor/encoding/single-byte.py
+index d7d1bb2..5420c16 100755
+--- a/utils/lexbor/encoding/single-byte.py
++++ b/utils/lexbor/encoding/single-byte.py
+@@ -128,7 +128,7 @@ class SingleByte:
+                 entries = values[idx]
+                 key_id = entries[1].decode('utf-8')
+ 
+-                hash_key.append(key_id, '(void *) {}'.format(idx + 0x80))
++                hash_key.append(key_id, idx + 0x80)
+ 
+         return hash_key.create(rate = 1)
+ 
+@@ -161,7 +161,7 @@ def toHex(s):
+     lst = []
+ 
+     for ch in bytes(s, 'utf-8'):
+-        hv = hex(ch).replace('0x', '\\\\x')
++        hv = hex(ch).replace('0x', '\\x')
+         lst.append("'{}'".format(hv))
+ 
+     return ', '.join(lst)
+diff --git a/utils/lexbor/lexbor/LXB.py b/utils/lexbor/lexbor/LXB.py
+index 3e75812..2370c66 100755
+--- a/utils/lexbor/lexbor/LXB.py
++++ b/utils/lexbor/lexbor/LXB.py
+@@ -94,7 +94,7 @@ class HashKey:
+     def append(self, key_id, value):
+         self.buffer.append([self.hash_id(int(key_id, 0)), value])
+ 
+-    def create(self, terminate_value = '{0, NULL, 0}', rate = 2, is_const = True, data_before = None):
++    def create(self, terminate_value = '{0, 0, 0}', rate = 2, is_const = True, data_before = None):
+         test = self.test(int(self.max_table_size / 1.2), int(self.max_table_size * 1.2))
+ 
+         rate_dn = rate - 1
+@@ -142,9 +142,12 @@ class HashKey:
+             entry = table[idx]
+ 
+             if entry:
++                assert entry[0] < 2**32
++                assert entry[1] < 2**32
++                assert entry[2] < 2**32
+                 result.append("{{{}, {}, {}}},".format(entry[0], entry[1], entry[2]))
+             else:
+-                result.append("{0, NULL, 0},")
++                result.append("{0, 0, 0},")
+ 
+             if int(idx) % rate == rate_dn:
+                 result.append("\n    ")
+@@ -154,9 +157,12 @@ class HashKey:
+         if len(table):
+             entry = table[-1]
+             if entry:
++                assert entry[0] < 2**32
++                assert entry[1] < 2**32
++                assert entry[2] < 2**32
+                 result.append("{{{}, {}, {}}}\n".format(entry[0], entry[1], entry[2]))
+             else:
+-                result.append("{0, NULL, 0}\n")
++                result.append("{0, 0, 0}\n")
+ 
+         result.append("};")
+ 
+-- 
+2.51.2
+