Skip to content

Commit d58d3d2

Browse files
committed
Merge branch 'PHP-8.4' into PHP-8.5
* PHP-8.4: Upgrade Lexbor to v2.7.0
2 parents 1c6c7bc + 16baee5 commit d58d3d2

File tree

169 files changed

+55766
-35285
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

169 files changed

+55766
-35285
lines changed
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
From 0cd2add6c46400b808329442f81451b369863983 Mon Sep 17 00:00:00 2001
2+
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
3+
Date: Sat, 26 Aug 2023 15:08:59 +0200
4+
Subject: [PATCH 1/6] Expose line and column information for use in PHP
5+
6+
---
7+
source/lexbor/dom/interfaces/node.h | 2 ++
8+
source/lexbor/html/token.h | 2 ++
9+
source/lexbor/html/tokenizer.c | 24 +++++++++++++++++++++++-
10+
source/lexbor/html/tokenizer.h | 2 ++
11+
source/lexbor/html/tokenizer/state.h | 2 ++
12+
source/lexbor/html/tree.c | 11 +++++++++++
13+
source/lexbor/html/tree/error.c | 5 +++--
14+
source/lexbor/html/tree/error.h | 5 +++--
15+
8 files changed, 48 insertions(+), 5 deletions(-)
16+
17+
diff --git a/source/lexbor/dom/interfaces/node.h b/source/lexbor/dom/interfaces/node.h
18+
index 6c74ac5..b95373c 100644
19+
--- a/source/lexbor/dom/interfaces/node.h
20+
+++ b/source/lexbor/dom/interfaces/node.h
21+
@@ -86,6 +86,8 @@ struct lxb_dom_node {
22+
23+
lxb_dom_node_type_t type;
24+
25+
+ size_t line;
26+
+
27+
#ifdef LXB_DOM_NODE_USER_VARIABLES
28+
LXB_DOM_NODE_USER_VARIABLES
29+
#endif /* LXB_DOM_NODE_USER_VARIABLES */
30+
diff --git a/source/lexbor/html/token.h b/source/lexbor/html/token.h
31+
index 79accd0..0b7f4fd 100644
32+
--- a/source/lexbor/html/token.h
33+
+++ b/source/lexbor/html/token.h
34+
@@ -33,6 +33,8 @@ enum lxb_html_token_type {
35+
typedef struct {
36+
const lxb_char_t *begin;
37+
const lxb_char_t *end;
38+
+ size_t line;
39+
+ size_t column;
40+
41+
const lxb_char_t *text_start;
42+
const lxb_char_t *text_end;
43+
diff --git a/source/lexbor/html/tokenizer.c b/source/lexbor/html/tokenizer.c
44+
index 22b88ed..1d9f378 100644
45+
--- a/source/lexbor/html/tokenizer.c
46+
+++ b/source/lexbor/html/tokenizer.c
47+
@@ -92,6 +92,7 @@ lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)
48+
49+
tkz->pos = tkz->start;
50+
tkz->end = tkz->start + LXB_HTML_TKZ_TEMP_SIZE;
51+
+ /* current_line & current_column already initialized by calloc (zero-based) */
52+
53+
tkz->tree = NULL;
54+
tkz->tags = NULL;
55+
@@ -153,6 +154,8 @@ lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to,
56+
tkz_to->start = tkz_from->start;
57+
tkz_to->end = tkz_from->end;
58+
tkz_to->pos = tkz_to->start;
59+
+ tkz_to->current_line = tkz_from->current_line;
60+
+ tkz_to->current_column = tkz_from->current_column;
61+
62+
return LXB_STATUS_OK;
63+
}
64+
@@ -571,7 +574,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
65+
tkz->last = end;
66+
67+
while (data < end) {
68+
- data = tkz->state(tkz, data, end);
69+
+ size_t current_column = tkz->current_column;
70+
+ const lxb_char_t *new_data = tkz->state(tkz, data, end);
71+
+ while (data < new_data) {
72+
+ /* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
73+
+ if (*data == '\n') {
74+
+ tkz->current_line++;
75+
+ current_column = 0;
76+
+ } else {
77+
+ /* Other characters can be mapped back to the unicode codepoint offset because UTF-8 is a prefix code.
78+
+ * Continuation bytes start with 0b10XXXXXX so we can skip those to only get the start of an encoded code point. */
79+
+ if ((*data & 0b11000000) == 0b10000000) {
80+
+ /* Continuation byte, do nothing */
81+
+ } else {
82+
+ /* First byte for a codepoint */
83+
+ current_column++;
84+
+ }
85+
+ }
86+
+ data++;
87+
+ }
88+
+ tkz->current_column = current_column;
89+
}
90+
91+
return tkz->status;
92+
diff --git a/source/lexbor/html/tokenizer.h b/source/lexbor/html/tokenizer.h
93+
index 12b7c81..aa1ac37 100644
94+
--- a/source/lexbor/html/tokenizer.h
95+
+++ b/source/lexbor/html/tokenizer.h
96+
@@ -79,6 +79,8 @@ struct lxb_html_tokenizer {
97+
const lxb_char_t *end;
98+
const lxb_char_t *begin;
99+
const lxb_char_t *last;
100+
+ size_t current_line;
101+
+ size_t current_column;
102+
103+
/* Entities */
104+
const lexbor_sbst_entry_static_t *entity;
105+
diff --git a/source/lexbor/html/tokenizer/state.h b/source/lexbor/html/tokenizer/state.h
106+
index 5e91444..52eaa9a 100644
107+
--- a/source/lexbor/html/tokenizer/state.h
108+
+++ b/source/lexbor/html/tokenizer/state.h
109+
@@ -90,6 +90,8 @@ extern "C" {
110+
do { \
111+
tkz->pos = tkz->start; \
112+
tkz->token->begin = v_begin; \
113+
+ tkz->token->line = tkz->current_line; \
114+
+ tkz->token->column = tkz->current_column; \
115+
} \
116+
while (0)
117+
118+
diff --git a/source/lexbor/html/tree.c b/source/lexbor/html/tree.c
119+
index 062ea56..3f4c18d 100644
120+
--- a/source/lexbor/html/tree.c
121+
+++ b/source/lexbor/html/tree.c
122+
@@ -431,6 +431,9 @@ lxb_html_tree_create_element_for_token(lxb_html_tree_t *tree,
123+
return NULL;
124+
}
125+
126+
+ node->line = token->line;
127+
+ /* We only expose line number in PHP DOM */
128+
+
129+
lxb_status_t status;
130+
lxb_dom_element_t *element = lxb_dom_interface_element(node);
131+
132+
@@ -767,6 +770,11 @@ lxb_html_tree_insert_character_for_data(lxb_html_tree_t *tree,
133+
134+
lxb_dom_interface_text(text)->char_data.data = *str;
135+
136+
+ if (tree->tkz_ref) {
137+
+ text->line = tree->tkz_ref->token->line;
138+
+ /* We only expose line number in PHP DOM */
139+
+ }
140+
+
141+
if (ret_node != NULL) {
142+
*ret_node = text;
143+
}
144+
@@ -806,6 +814,9 @@ lxb_html_tree_insert_comment(lxb_html_tree_t *tree,
145+
return NULL;
146+
}
147+
148+
+ node->line = token->line;
149+
+ /* We only expose line number in PHP DOM */
150+
+
151+
tree->status = lxb_html_token_make_text(token, &comment->char_data.data,
152+
tree->document->dom_document.text);
153+
if (tree->status != LXB_STATUS_OK) {
154+
diff --git a/source/lexbor/html/tree/error.c b/source/lexbor/html/tree/error.c
155+
index ffdc55c..ef36eab 100644
156+
--- a/source/lexbor/html/tree/error.c
157+
+++ b/source/lexbor/html/tree/error.c
158+
@@ -22,8 +22,9 @@ lxb_html_tree_error_add(lexbor_array_obj_t *parse_errors,
159+
}
160+
161+
entry->id = id;
162+
- entry->begin = token->begin;
163+
- entry->end = token->end;
164+
+ entry->line = token->line;
165+
+ entry->column = token->column;
166+
+ entry->length = token->end - token->begin;
167+
168+
return entry;
169+
}
170+
diff --git a/source/lexbor/html/tree/error.h b/source/lexbor/html/tree/error.h
171+
index 7a212af..b186772 100644
172+
--- a/source/lexbor/html/tree/error.h
173+
+++ b/source/lexbor/html/tree/error.h
174+
@@ -109,8 +109,9 @@ lxb_html_tree_error_id_t;
175+
176+
typedef struct {
177+
lxb_html_tree_error_id_t id;
178+
- const lxb_char_t *begin;
179+
- const lxb_char_t *end;
180+
+ size_t line;
181+
+ size_t column;
182+
+ size_t length;
183+
}
184+
lxb_html_tree_error_t;
185+
186+
--
187+
2.51.2
188+
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
From a4c29ba8d1ea1065ce6bd4a34382d53140cf1924 Mon Sep 17 00:00:00 2001
2+
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
3+
Date: Mon, 14 Aug 2023 20:18:51 +0200
4+
Subject: [PATCH 2/6] Track implied added nodes for options use in PHP
5+
6+
---
7+
source/lexbor/html/tree.h | 3 +++
8+
source/lexbor/html/tree/insertion_mode/after_head.c | 1 +
9+
source/lexbor/html/tree/insertion_mode/before_head.c | 2 ++
10+
source/lexbor/html/tree/insertion_mode/before_html.c | 2 ++
11+
4 files changed, 8 insertions(+)
12+
13+
diff --git a/source/lexbor/html/tree.h b/source/lexbor/html/tree.h
14+
index 4912efb..7b2c620 100644
15+
--- a/source/lexbor/html/tree.h
16+
+++ b/source/lexbor/html/tree.h
17+
@@ -55,6 +55,9 @@ struct lxb_html_tree {
18+
bool foster_parenting;
19+
bool frameset_ok;
20+
bool scripting;
21+
+ bool has_explicit_html_tag;
22+
+ bool has_explicit_head_tag;
23+
+ bool has_explicit_body_tag;
24+
25+
lxb_html_tree_insertion_mode_f mode;
26+
lxb_html_tree_insertion_mode_f original_mode;
27+
diff --git a/source/lexbor/html/tree/insertion_mode/after_head.c b/source/lexbor/html/tree/insertion_mode/after_head.c
28+
index ad551b5..1448654 100644
29+
--- a/source/lexbor/html/tree/insertion_mode/after_head.c
30+
+++ b/source/lexbor/html/tree/insertion_mode/after_head.c
31+
@@ -71,6 +71,7 @@ lxb_html_tree_insertion_mode_after_head_open(lxb_html_tree_t *tree,
32+
return lxb_html_tree_process_abort(tree);
33+
}
34+
35+
+ tree->has_explicit_body_tag = true;
36+
tree->frameset_ok = false;
37+
tree->mode = lxb_html_tree_insertion_mode_in_body;
38+
39+
diff --git a/source/lexbor/html/tree/insertion_mode/before_head.c b/source/lexbor/html/tree/insertion_mode/before_head.c
40+
index 14621f2..cd2ac2a 100644
41+
--- a/source/lexbor/html/tree/insertion_mode/before_head.c
42+
+++ b/source/lexbor/html/tree/insertion_mode/before_head.c
43+
@@ -67,6 +67,8 @@ lxb_html_tree_insertion_mode_before_head_open(lxb_html_tree_t *tree,
44+
return lxb_html_tree_process_abort(tree);
45+
}
46+
47+
+ tree->has_explicit_head_tag = true;
48+
+
49+
tree->mode = lxb_html_tree_insertion_mode_in_head;
50+
51+
break;
52+
diff --git a/source/lexbor/html/tree/insertion_mode/before_html.c b/source/lexbor/html/tree/insertion_mode/before_html.c
53+
index 05fe738..1e09cda 100644
54+
--- a/source/lexbor/html/tree/insertion_mode/before_html.c
55+
+++ b/source/lexbor/html/tree/insertion_mode/before_html.c
56+
@@ -78,6 +78,8 @@ lxb_html_tree_insertion_mode_before_html_open(lxb_html_tree_t *tree,
57+
return lxb_html_tree_process_abort(tree);
58+
}
59+
60+
+ tree->has_explicit_html_tag = true;
61+
+
62+
tree->mode = lxb_html_tree_insertion_mode_before_head;
63+
64+
break;
65+
--
66+
2.51.2
67+
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
From 46fc776449252e74795569759a19d13857a59069 Mon Sep 17 00:00:00 2001
2+
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
3+
Date: Thu, 24 Aug 2023 22:57:48 +0200
4+
Subject: [PATCH 3/6] Patch utilities and data structure to be able to generate
5+
smaller lookup tables
6+
7+
Changed the generation script to check if everything fits in 32 bits.
8+
Also changed the actual field types to 32 bits. This reduces the size of
9+
the hash tables.
10+
---
11+
source/lexbor/core/shs.h | 4 ++--
12+
utils/lexbor/encoding/single-byte.py | 4 ++--
13+
utils/lexbor/lexbor/LXB.py | 12 +++++++++---
14+
3 files changed, 13 insertions(+), 7 deletions(-)
15+
16+
diff --git a/source/lexbor/core/shs.h b/source/lexbor/core/shs.h
17+
index 7a63a07..c84dfaa 100644
18+
--- a/source/lexbor/core/shs.h
19+
+++ b/source/lexbor/core/shs.h
20+
@@ -27,9 +27,9 @@ lexbor_shs_entry_t;
21+
22+
typedef struct {
23+
uint32_t key;
24+
- void *value;
25+
+ uint32_t value;
26+
27+
- size_t next;
28+
+ uint32_t next;
29+
}
30+
lexbor_shs_hash_t;
31+
32+
diff --git a/utils/lexbor/encoding/single-byte.py b/utils/lexbor/encoding/single-byte.py
33+
index d7d1bb2..5420c16 100755
34+
--- a/utils/lexbor/encoding/single-byte.py
35+
+++ b/utils/lexbor/encoding/single-byte.py
36+
@@ -128,7 +128,7 @@ class SingleByte:
37+
entries = values[idx]
38+
key_id = entries[1].decode('utf-8')
39+
40+
- hash_key.append(key_id, '(void *) {}'.format(idx + 0x80))
41+
+ hash_key.append(key_id, idx + 0x80)
42+
43+
return hash_key.create(rate = 1)
44+
45+
@@ -161,7 +161,7 @@ def toHex(s):
46+
lst = []
47+
48+
for ch in bytes(s, 'utf-8'):
49+
- hv = hex(ch).replace('0x', '\\\\x')
50+
+ hv = hex(ch).replace('0x', '\\x')
51+
lst.append("'{}'".format(hv))
52+
53+
return ', '.join(lst)
54+
diff --git a/utils/lexbor/lexbor/LXB.py b/utils/lexbor/lexbor/LXB.py
55+
index 3e75812..2370c66 100755
56+
--- a/utils/lexbor/lexbor/LXB.py
57+
+++ b/utils/lexbor/lexbor/LXB.py
58+
@@ -94,7 +94,7 @@ class HashKey:
59+
def append(self, key_id, value):
60+
self.buffer.append([self.hash_id(int(key_id, 0)), value])
61+
62+
- def create(self, terminate_value = '{0, NULL, 0}', rate = 2, is_const = True, data_before = None):
63+
+ def create(self, terminate_value = '{0, 0, 0}', rate = 2, is_const = True, data_before = None):
64+
test = self.test(int(self.max_table_size / 1.2), int(self.max_table_size * 1.2))
65+
66+
rate_dn = rate - 1
67+
@@ -142,9 +142,12 @@ class HashKey:
68+
entry = table[idx]
69+
70+
if entry:
71+
+ assert entry[0] < 2**32
72+
+ assert entry[1] < 2**32
73+
+ assert entry[2] < 2**32
74+
result.append("{{{}, {}, {}}},".format(entry[0], entry[1], entry[2]))
75+
else:
76+
- result.append("{0, NULL, 0},")
77+
+ result.append("{0, 0, 0},")
78+
79+
if int(idx) % rate == rate_dn:
80+
result.append("\n ")
81+
@@ -154,9 +157,12 @@ class HashKey:
82+
if len(table):
83+
entry = table[-1]
84+
if entry:
85+
+ assert entry[0] < 2**32
86+
+ assert entry[1] < 2**32
87+
+ assert entry[2] < 2**32
88+
result.append("{{{}, {}, {}}}\n".format(entry[0], entry[1], entry[2]))
89+
else:
90+
- result.append("{0, NULL, 0}\n")
91+
+ result.append("{0, 0, 0}\n")
92+
93+
result.append("};")
94+
95+
--
96+
2.51.2
97+

0 commit comments

Comments
 (0)