Skip to content

Commit d0e479a

Browse files
committed
MDEV-39106 Add MySQL 8.0 extended syntax for REGEXP_INSTR
Extend REGEXP_INSTR to accept the full MySQL 8.0 signature:

    REGEXP_INSTR(subject, pattern [, pos [, occurrence [, return_option [, match_type]]]])

Previously only 2 arguments were accepted.

Changes:
- Switch Create_func_regexp_instr from Create_func_arg2 to Create_native_func to allow 2-6 arguments.
- Add parse_match_type_flags() to Regexp_processor_pcre, which parses the match_type flags (c/i/m/n/u) and overwrites m_library_flags with the fully-resolved PCRE2 flag word.
- Call parse_match_type_flags() in fix_length_and_dec() before fix_owner() when match_type is constant, so the pattern is compiled with the correct flags. For constant patterns fix_owner() sets m_is_const=true, making recompile() a permanent no-op.
- For non-constant match_type, resolve flags per-row in val_int() and call compile() directly to bypass the recompile() no-op guard.
- Add MTR test: regexp_instr_mysql8.test
1 parent b9bf38d commit d0e479a

5 files changed

Lines changed: 531 additions & 11 deletions

File tree

Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
#
2+
# MDEV-39520: Improve REGEXP_INSTR for MySQL 8.0 compatibility
3+
#
4+
# 1. Basic 2-argument form
5+
SELECT REGEXP_INSTR('abba', 'b{2}');
6+
REGEXP_INSTR('abba', 'b{2}')
7+
2
8+
SELECT REGEXP_INSTR('abba', 'x');
9+
REGEXP_INSTR('abba', 'x')
10+
0
11+
SELECT REGEXP_INSTR('hello world', 'world');
12+
REGEXP_INSTR('hello world', 'world')
13+
7
14+
SELECT REGEXP_INSTR('hello', '');
15+
REGEXP_INSTR('hello', '')
16+
1
17+
SELECT REGEXP_INSTR('', 'a');
18+
REGEXP_INSTR('', 'a')
19+
0
20+
SELECT REGEXP_INSTR('', '');
21+
REGEXP_INSTR('', '')
22+
1
23+
# 2. Three arguments: pos
24+
SELECT REGEXP_INSTR('abba', 'b{2}', 2);
25+
REGEXP_INSTR('abba', 'b{2}', 2)
26+
2
27+
SELECT REGEXP_INSTR('abba', 'b{2}', 3);
28+
REGEXP_INSTR('abba', 'b{2}', 3)
29+
0
30+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1);
31+
REGEXP_INSTR('abbabba', 'b{2}', 1)
32+
2
33+
SELECT REGEXP_INSTR('aabba', 'b', 3);
34+
REGEXP_INSTR('aabba', 'b', 3)
35+
3
36+
SELECT REGEXP_INSTR('xyzabc', 'abc', 4);
37+
REGEXP_INSTR('xyzabc', 'abc', 4)
38+
4
39+
SELECT REGEXP_INSTR('abc', 'c', 3);
40+
REGEXP_INSTR('abc', 'c', 3)
41+
3
42+
SELECT REGEXP_INSTR('abc', 'c', 4);
43+
REGEXP_INSTR('abc', 'c', 4)
44+
0
45+
# 3. Four arguments: occurrence
46+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1, 1);
47+
REGEXP_INSTR('abbabba', 'b{2}', 1, 1)
48+
2
49+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1, 2);
50+
REGEXP_INSTR('abbabba', 'b{2}', 1, 2)
51+
5
52+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1, 3);
53+
REGEXP_INSTR('abbabba', 'b{2}', 1, 3)
54+
0
55+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 2, 1);
56+
REGEXP_INSTR('abbabba', 'b{2}', 2, 1)
57+
2
58+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 2, 2);
59+
REGEXP_INSTR('abbabba', 'b{2}', 2, 2)
60+
5
61+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 3, 2);
62+
REGEXP_INSTR('abbabba', 'b{2}', 3, 2)
63+
0
64+
SELECT REGEXP_INSTR('aaa', 'a', 1, 1);
65+
REGEXP_INSTR('aaa', 'a', 1, 1)
66+
1
67+
SELECT REGEXP_INSTR('aaa', 'a', 1, 2);
68+
REGEXP_INSTR('aaa', 'a', 1, 2)
69+
2
70+
SELECT REGEXP_INSTR('aaa', 'a', 1, 3);
71+
REGEXP_INSTR('aaa', 'a', 1, 3)
72+
3
73+
SELECT REGEXP_INSTR('aaa', 'a', 1, 4);
74+
REGEXP_INSTR('aaa', 'a', 1, 4)
75+
0
76+
# 4. Five arguments: return_option
77+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1, 1, 0);
78+
REGEXP_INSTR('abbabba', 'b{2}', 1, 1, 0)
79+
2
80+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1, 1, 1);
81+
REGEXP_INSTR('abbabba', 'b{2}', 1, 1, 1)
82+
4
83+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1, 2, 0);
84+
REGEXP_INSTR('abbabba', 'b{2}', 1, 2, 0)
85+
5
86+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1, 2, 1);
87+
REGEXP_INSTR('abbabba', 'b{2}', 1, 2, 1)
88+
7
89+
SELECT REGEXP_INSTR('abcabc', 'b', 1, 1, 0);
90+
REGEXP_INSTR('abcabc', 'b', 1, 1, 0)
91+
2
92+
SELECT REGEXP_INSTR('abcabc', 'b', 1, 1, 1);
93+
REGEXP_INSTR('abcabc', 'b', 1, 1, 1)
94+
3
95+
SELECT REGEXP_INSTR('abcabc', 'b', 1, 2, 0);
96+
REGEXP_INSTR('abcabc', 'b', 1, 2, 0)
97+
5
98+
SELECT REGEXP_INSTR('abcabc', 'b', 1, 2, 1);
99+
REGEXP_INSTR('abcabc', 'b', 1, 2, 1)
100+
6
101+
SELECT REGEXP_INSTR('abcabc', 'z', 1, 1, 1);
102+
REGEXP_INSTR('abcabc', 'z', 1, 1, 1)
103+
0
104+
# 5. Six arguments: match_type
105+
SELECT REGEXP_INSTR('Abba', 'ABBA', 1, 1, 0, 'i');
106+
REGEXP_INSTR('Abba', 'ABBA', 1, 1, 0, 'i')
107+
1
108+
SELECT REGEXP_INSTR('Abba', 'ABBA', 1, 1, 0, 'c');
109+
REGEXP_INSTR('Abba', 'ABBA', 1, 1, 0, 'c')
110+
0
111+
SELECT REGEXP_INSTR('Abba', 'ABBA', 1, 1, 0, 'ci');
112+
REGEXP_INSTR('Abba', 'ABBA', 1, 1, 0, 'ci')
113+
1
114+
SELECT REGEXP_INSTR('Abba', 'ABBA', 1, 1, 0, 'ic');
115+
REGEXP_INSTR('Abba', 'ABBA', 1, 1, 0, 'ic')
116+
0
117+
SELECT REGEXP_INSTR('a\nb\nc', '^b$', 1, 1, 0, 'm');
118+
REGEXP_INSTR('a\nb\nc', '^b$', 1, 1, 0, 'm')
119+
3
120+
SELECT REGEXP_INSTR('a\nb\nc', '^b$', 1, 1, 0, '');
121+
REGEXP_INSTR('a\nb\nc', '^b$', 1, 1, 0, '')
122+
0
123+
SELECT REGEXP_INSTR('a\nb\nc', 'a.b', 1, 1, 0, 'n');
124+
REGEXP_INSTR('a\nb\nc', 'a.b', 1, 1, 0, 'n')
125+
1
126+
SELECT REGEXP_INSTR('a\nb\nc', 'a.b', 1, 1, 0, '');
127+
REGEXP_INSTR('a\nb\nc', 'a.b', 1, 1, 0, '')
128+
0
129+
SELECT REGEXP_INSTR('a\nb', '^b$', 1, 1, 0, 'mu');
130+
REGEXP_INSTR('a\nb', '^b$', 1, 1, 0, 'mu')
131+
3
132+
SELECT REGEXP_INSTR('a\nB\nc', '^b$', 1, 1, 0, 'im');
133+
REGEXP_INSTR('a\nB\nc', '^b$', 1, 1, 0, 'im')
134+
3
135+
# 6. Multibyte characters
136+
SET NAMES utf8mb4;
137+
SELECT REGEXP_INSTR('áéí', 'é');
138+
REGEXP_INSTR('áéí', 'é')
139+
2
140+
SELECT REGEXP_INSTR('áéí', 'í');
141+
REGEXP_INSTR('áéí', 'í')
142+
3
143+
SELECT REGEXP_INSTR('αβγδ', 'γ');
144+
REGEXP_INSTR('αβγδ', 'γ')
145+
3
146+
SELECT REGEXP_INSTR('áéíó', 'í', 2);
147+
REGEXP_INSTR('áéíó', 'í', 2)
148+
3
149+
SELECT REGEXP_INSTR('αβγδ', 'β', 2);
150+
REGEXP_INSTR('αβγδ', 'β', 2)
151+
2
152+
SELECT REGEXP_INSTR('αβγδ', 'β', 1, 1, 1);
153+
REGEXP_INSTR('αβγδ', 'β', 1, 1, 1)
154+
3
155+
# 7. NULL propagation
156+
SELECT REGEXP_INSTR(NULL, 'a');
157+
REGEXP_INSTR(NULL, 'a')
158+
NULL
159+
SELECT REGEXP_INSTR('abc', NULL);
160+
REGEXP_INSTR('abc', NULL)
161+
NULL
162+
SELECT REGEXP_INSTR('abc', 'a', NULL);
163+
REGEXP_INSTR('abc', 'a', NULL)
164+
NULL
165+
SELECT REGEXP_INSTR('abc', 'a', 1, NULL);
166+
REGEXP_INSTR('abc', 'a', 1, NULL)
167+
NULL
168+
SELECT REGEXP_INSTR('abc', 'a', 1, 1, NULL);
169+
REGEXP_INSTR('abc', 'a', 1, 1, NULL)
170+
NULL
171+
SELECT REGEXP_INSTR('abc', 'a', 1, 1, 0, NULL);
172+
REGEXP_INSTR('abc', 'a', 1, 1, 0, NULL)
173+
NULL
174+
# 8. Edge cases
175+
SELECT REGEXP_INSTR('abc', '', 1, 1);
176+
REGEXP_INSTR('abc', '', 1, 1)
177+
1
178+
SELECT REGEXP_INSTR('abc', '', 1, 2);
179+
REGEXP_INSTR('abc', '', 1, 2)
180+
2
181+
SELECT REGEXP_INSTR('abc', '', 1, 3);
182+
REGEXP_INSTR('abc', '', 1, 3)
183+
3
184+
SELECT REGEXP_INSTR('abc', '', 1, 4);
185+
REGEXP_INSTR('abc', '', 1, 4)
186+
4
187+
SELECT REGEXP_INSTR('abc', '', 1, 5);
188+
REGEXP_INSTR('abc', '', 1, 5)
189+
0
190+
SELECT REGEXP_INSTR('abcabc', '^abc');
191+
REGEXP_INSTR('abcabc', '^abc')
192+
1
193+
SELECT REGEXP_INSTR('abcabc', 'abc$');
194+
REGEXP_INSTR('abcabc', 'abc$')
195+
4
196+
SELECT REGEXP_INSTR('abcabc', '^abc$');
197+
REGEXP_INSTR('abcabc', '^abc$')
198+
0
199+
SELECT REGEXP_INSTR('foo bar baz', 'bar|baz', 1, 1);
200+
REGEXP_INSTR('foo bar baz', 'bar|baz', 1, 1)
201+
5
202+
SELECT REGEXP_INSTR('foo bar baz', 'bar|baz', 1, 2);
203+
REGEXP_INSTR('foo bar baz', 'bar|baz', 1, 2)
204+
9
205+
SELECT REGEXP_INSTR('aabbaabb', '(a+)(b+)\\1', 1, 1);
206+
REGEXP_INSTR('aabbaabb', '(a+)(b+)\\1', 1, 1)
207+
1
208+
SELECT REGEXP_INSTR('aabbaabb', '(a+)(b+)\\1', 1, 2);
209+
REGEXP_INSTR('aabbaabb', '(a+)(b+)\\1', 1, 2)
210+
0
211+
SELECT REGEXP_INSTR(REPEAT('x', 1000), 'x{5}', 1, 1);
212+
REGEXP_INSTR(REPEAT('x', 1000), 'x{5}', 1, 1)
213+
1
214+
SELECT REGEXP_INSTR(REPEAT('x', 1000), 'x{5}', 1, 200);
215+
REGEXP_INSTR(REPEAT('x', 1000), 'x{5}', 1, 200)
216+
996
217+
SELECT REGEXP_INSTR('Test-abc-abc-abc', 'AB', 1, 3, 0);
218+
REGEXP_INSTR('Test-abc-abc-abc', 'AB', 1, 3, 0)
219+
14
220+
SELECT REGEXP_INSTR('Test-abc-abc-abc', 'AB', 1, 3, 0, 'c');
221+
REGEXP_INSTR('Test-abc-abc-abc', 'AB', 1, 3, 0, 'c')
222+
0
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
--echo #
2+
--echo # MDEV-39520: Improve REGEXP_INSTR for MySQL 8.0 compatibility
3+
--echo #
# Exercises the 2- to 6-argument forms:
#   REGEXP_INSTR(subject, pattern[, pos[, occurrence[, return_option[, match_type]]]])
# NOTE(review): the commit title references MDEV-39106 while this header says
# MDEV-39520 - confirm which ticket number is correct.
4+
5+
--echo # 1. Basic 2-argument form
# Two-argument form: the expected result is the 1-based position of the first
# match, or 0 when nothing matches. An empty pattern is expected to match at
# position 1, even when the subject itself is empty.
6+
7+
SELECT REGEXP_INSTR('abba', 'b{2}');
8+
SELECT REGEXP_INSTR('abba', 'x');
9+
SELECT REGEXP_INSTR('hello world', 'world');
10+
SELECT REGEXP_INSTR('hello', '');
11+
SELECT REGEXP_INSTR('', 'a');
12+
SELECT REGEXP_INSTR('', '');
13+
14+
--echo # 2. Three arguments: pos
# pos is the 1-based position at which the search starts. The reported
# position is still counted from the start of the subject, not from pos:
# searching 'xyzabc' from pos 4 is expected to return 4, not 1.
# A match that begins before pos is not found ('b{2}' in 'abba' from pos 3
# gives 0), and pos one past the end of the subject gives 0 for a non-empty
# pattern.
15+
16+
SELECT REGEXP_INSTR('abba', 'b{2}', 2);
17+
SELECT REGEXP_INSTR('abba', 'b{2}', 3);
18+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1);
19+
SELECT REGEXP_INSTR('aabba', 'b', 3);
20+
SELECT REGEXP_INSTR('xyzabc', 'abc', 4);
21+
SELECT REGEXP_INSTR('abc', 'c', 3);
22+
SELECT REGEXP_INSTR('abc', 'c', 4);
23+
24+
--echo # 3. Four arguments: occurrence
# occurrence selects the Nth match found at or after pos. When fewer than N
# matches exist from pos onwards the expected result is 0 (e.g. 'abbabba'
# has two 'bb' matches from pos 1 but only one from pos 3).
25+
26+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1, 1);
27+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1, 2);
28+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1, 3);
29+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 2, 1);
30+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 2, 2);
31+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 3, 2);
32+
33+
# Repeating pattern
# Adjacent single-character matches: occurrences 1..3 are at positions 1..3,
# and asking for a 4th occurrence in a 3-character subject yields 0.
34+
SELECT REGEXP_INSTR('aaa', 'a', 1, 1);
35+
SELECT REGEXP_INSTR('aaa', 'a', 1, 2);
36+
SELECT REGEXP_INSTR('aaa', 'a', 1, 3);
37+
SELECT REGEXP_INSTR('aaa', 'a', 1, 4);
38+
39+
--echo # 4. Five arguments: return_option
# return_option 0 returns the position where the match starts;
# return_option 1 returns the position immediately after the match ends
# (e.g. the first 'bb' in 'abbabba' starts at 2 and ends just before 4).
40+
41+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1, 1, 0);
42+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1, 1, 1);
43+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1, 2, 0);
44+
SELECT REGEXP_INSTR('abbabba', 'b{2}', 1, 2, 1);
45+
46+
# Same checks with a one-character match, where end = start + 1.
# The final statement checks that a failed match still returns 0 when
# return_option is 1.
SELECT REGEXP_INSTR('abcabc', 'b', 1, 1, 0);
47+
SELECT REGEXP_INSTR('abcabc', 'b', 1, 1, 1);
48+
SELECT REGEXP_INSTR('abcabc', 'b', 1, 2, 0);
49+
SELECT REGEXP_INSTR('abcabc', 'b', 1, 2, 1);
50+
SELECT REGEXP_INSTR('abcabc', 'z', 1, 1, 1);
51+
52+
--echo # 5. Six arguments: match_type
# match_type is a string of single-letter flags. As exercised below:
#   i - case-insensitive matching ('ABBA' matches 'Abba')
#   c - case-sensitive matching ('ABBA' does not match 'Abba')
#   m - multi-line: ^ and $ match at line boundaries inside the subject
#   n - the dot also matches a newline
#   u - only used together with 'm' here; presumably "Unix-only line
#       endings" as in MySQL 8.0 - TODO confirm
# When contradictory flags are combined the rightmost one wins:
# 'ci' behaves like 'i', 'ic' behaves like 'c'.
# An empty match_type leaves multi-line and dot-matches-newline off.
# NOTE(review): every match_type below is a literal, so only the
# constant-match_type path is covered; a non-constant match_type
# (e.g. read from a table column) is not exercised by this test.
53+
54+
SELECT REGEXP_INSTR('Abba', 'ABBA', 1, 1, 0, 'i');
55+
SELECT REGEXP_INSTR('Abba', 'ABBA', 1, 1, 0, 'c');
56+
SELECT REGEXP_INSTR('Abba', 'ABBA', 1, 1, 0, 'ci');
57+
SELECT REGEXP_INSTR('Abba', 'ABBA', 1, 1, 0, 'ic');
58+
SELECT REGEXP_INSTR('a\nb\nc', '^b$', 1, 1, 0, 'm');
59+
SELECT REGEXP_INSTR('a\nb\nc', '^b$', 1, 1, 0, '');
60+
SELECT REGEXP_INSTR('a\nb\nc', 'a.b', 1, 1, 0, 'n');
61+
SELECT REGEXP_INSTR('a\nb\nc', 'a.b', 1, 1, 0, '');
62+
SELECT REGEXP_INSTR('a\nb', '^b$', 1, 1, 0, 'mu');
63+
SELECT REGEXP_INSTR('a\nB\nc', '^b$', 1, 1, 0, 'im');
64+
65+
--echo # 6. Multibyte characters
# Every letter used below occupies more than one byte in utf8mb4, so these
# statements check that results are counted in characters, not bytes.
66+
67+
SET NAMES utf8mb4;
68+
69+
# Returned match position is a character index.
SELECT REGEXP_INSTR('áéí', 'é');
70+
SELECT REGEXP_INSTR('áéí', 'í');
71+
SELECT REGEXP_INSTR('αβγδ', 'γ');
72+
73+
# The pos argument is interpreted as a character index as well.
SELECT REGEXP_INSTR('áéíó', 'í', 2);
74+
SELECT REGEXP_INSTR('αβγδ', 'β', 2);
75+
76+
# return_option 1: the end-of-match position is also in characters.
SELECT REGEXP_INSTR('αβγδ', 'β', 1, 1, 1);
77+
78+
--echo # 7. NULL propagation
# One statement per argument position (subject, pattern, pos, occurrence,
# return_option, match_type): a NULL in any of them is expected to make the
# whole result NULL.
79+
80+
SELECT REGEXP_INSTR(NULL, 'a');
81+
SELECT REGEXP_INSTR('abc', NULL);
82+
SELECT REGEXP_INSTR('abc', 'a', NULL);
83+
SELECT REGEXP_INSTR('abc', 'a', 1, NULL);
84+
SELECT REGEXP_INSTR('abc', 'a', 1, 1, NULL);
85+
SELECT REGEXP_INSTR('abc', 'a', 1, 1, 0, NULL);
86+
87+
--echo # 8. Edge cases
88+
89+
# Zero-length match
# An empty pattern is expected to match before each character and once at
# the end of the subject, so 'abc' has occurrences at 1, 2, 3 and 4; a 5th
# occurrence does not exist and yields 0.
90+
SELECT REGEXP_INSTR('abc', '', 1, 1);
91+
SELECT REGEXP_INSTR('abc', '', 1, 2);
92+
SELECT REGEXP_INSTR('abc', '', 1, 3);
93+
SELECT REGEXP_INSTR('abc', '', 1, 4);
94+
SELECT REGEXP_INSTR('abc', '', 1, 5);
95+
96+
# Anchored patterns
# ^ ties the match to the start of the subject and $ to its end, so
# '^abc$' cannot match the six-character subject 'abcabc'.
97+
SELECT REGEXP_INSTR('abcabc', '^abc');
98+
SELECT REGEXP_INSTR('abcabc', 'abc$');
99+
SELECT REGEXP_INSTR('abcabc', '^abc$');
100+
101+
# Alternation
# Occurrences are counted across both alternatives: 'bar' at 5 is the
# first match and 'baz' at 9 is the second.
102+
SELECT REGEXP_INSTR('foo bar baz', 'bar|baz', 1, 1);
103+
SELECT REGEXP_INSTR('foo bar baz', 'bar|baz', 1, 2);
104+
105+
# Back-references
# '\\1' refers back to capture group 1. The only match is 'aabbaa' at
# position 1; what remains after it cannot match, so occurrence 2 yields 0.
106+
SELECT REGEXP_INSTR('aabbaabb', '(a+)(b+)\\1', 1, 1);
107+
SELECT REGEXP_INSTR('aabbaabb', '(a+)(b+)\\1', 1, 2);
108+
109+
# Very long subject
# Successive occurrences do not overlap: the 200th 'x{5}' match starts at
# (200 - 1) * 5 + 1 = 996, which is also the last one that fits into the
# 1000-character subject.
110+
SELECT REGEXP_INSTR(REPEAT('x', 1000), 'x{5}', 1, 1);
111+
SELECT REGEXP_INSTR(REPEAT('x', 1000), 'x{5}', 1, 200);
112+
113+
# case-sensitive vs collation default
# Without match_type, 'AB' is expected to match the lower-case 'ab' (third
# occurrence at position 14), i.e. matching is case-insensitive in this
# session. With 'c' the match is case-sensitive and nothing matches.
114+
SELECT REGEXP_INSTR('Test-abc-abc-abc', 'AB', 1, 3, 0);
115+
SELECT REGEXP_INSTR('Test-abc-abc-abc', 'AB', 1, 3, 0, 'c');

0 commit comments

Comments
 (0)