@@ -128,43 +128,6 @@ from (values ('a'), ('b')) as tbl(col);
128128NULL NULL NULL
129129NULL NULL NULL
130130
131- # Extract domain from URL using anchored pattern with trailing .*
132- # This tests that the full URL suffix is replaced, not just the matched prefix
133- query T
134- SELECT regexp_replace(url, '^https?://(?:www\.)?([^/]+)/.*$', '\1') FROM (VALUES
135- ('https://www.example.com/path/to/page?q=1'),
136- ('http://test.org/foo/bar'),
137- ('https://example.com/'),
138- ('not-a-url')
139- ) AS t(url);
140- ----
141- example.com
142- test.org
143- example.com
144- not-a-url
145-
146- # More than one capture group should disable the short-regex fast path.
147- # This still uses replacement \1, but captures_len() will be > 2, so the
148- # implementation must fall back to the normal regexp_replace path.
149- query T
150- SELECT regexp_replace(url, '^https?://((www\.)?([^/]+))/.*$', '\1') FROM (VALUES
151- ('https://www.example.com/path/to/page?q=1'),
152- ('http://test.org/foo/bar'),
153- ('not-a-url')
154- ) AS t(url);
155- ----
156- www.example.com
157- test.org
158- not-a-url
159-
160- # If the overall pattern matches but capture group 1 does not participate,
161- # regexp_replace(..., '\1') should substitute the empty string, not keep
162- # the original input.
163- query B
164- SELECT regexp_replace('bzzz', '^(a)?b.*$', '\1') = '';
165- ----
166- true
167-
168131# Stripping trailing .*$ must not change match semantics for inputs with
169132# newlines when the original pattern does not use the 's' flag.
170133query B
@@ -183,3 +146,111 @@ SELECT regexp_replace(
183146 ) = concat('x', chr(10), 'rest');
184147----
185148true
149+
150+
151+ # Shared fixture for the regexp_replace optimization tests: one row per case (input value, pattern, replacement, expected result).
152+ statement ok
153+ CREATE TABLE regexp_replace_optimized_cases (
154+ value STRING,
155+ regexp STRING,
156+ replacement STRING,
157+ expected STRING
158+ );
159+
160+ # Extract the domain from a URL using an anchored pattern with a trailing .*
161+ # The full URL suffix must be replaced, not just the matched prefix; 'not-a-url' does not match and is expected back unchanged.
162+ statement ok
163+ INSERT INTO regexp_replace_optimized_cases (value, regexp, replacement, expected) VALUES
164+ ('https://www.example.com/path/to/page?q=1', '^https?://(?:www\.)?([^/]+)/.*$', '\1', 'example.com'),
165+ ('http://test.org/foo/bar', '^https?://(?:www\.)?([^/]+)/.*$', '\1', 'test.org'),
166+ ('https://example.com/', '^https?://(?:www\.)?([^/]+)/.*$', '\1', 'example.com'),
167+ ('not-a-url', '^https?://(?:www\.)?([^/]+)/.*$', '\1', 'not-a-url');
168+
169+ # More than one capture group should disable the short-regex fast path.
170+ # The replacement is still \1, but this pattern has three capture groups, so captures_len() will be > 2
171+ # and the implementation must fall back to the normal regexp_replace path.
172+ statement ok
173+ INSERT INTO regexp_replace_optimized_cases (value, regexp, replacement, expected) VALUES
174+ ('https://www.example.com/path/to/page?q=1', '^https?://((www\.)?([^/]+))/.*$', '\1', 'www.example.com'),
175+ ('http://test.org/foo/bar', '^https?://((www\.)?([^/]+))/.*$', '\1', 'test.org'),
176+ ('not-a-url', '^https?://((www\.)?([^/]+))/.*$', '\1', 'not-a-url');
177+
178+ # If the overall pattern matches but capture group 1 does not participate,
179+ # regexp_replace(..., '\1') should substitute the empty string rather than
180+ # keep the original input ('bzzz' matches via the 'b.*' part only).
181+ statement ok
182+ INSERT INTO regexp_replace_optimized_cases (value, regexp, replacement, expected) VALUES
183+ ('bzzz', '^(a)?b.*$', '\1', '');
184+
185+
186+ query TB
187+ SELECT c.value, regexp_replace(c.value, c.regexp, c.replacement) = c.expected
188+ FROM regexp_replace_optimized_cases AS c
189+ ORDER BY c.regexp, c.value, c.replacement, c.expected;
190+ ----
191+ bzzz true
192+ http://test.org/foo/bar true
193+ https://www.example.com/path/to/page?q=1 true
194+ not-a-url true
195+ http://test.org/foo/bar true
196+ https://example.com/ true
197+ https://www.example.com/path/to/page?q=1 true
198+ not-a-url true
199+
200+ query TB
201+ SELECT c.value, regexp_replace(
202+ arrow_cast(c.value, 'LargeUtf8'),
203+ arrow_cast(c.regexp, 'LargeUtf8'),
204+ arrow_cast(c.replacement, 'LargeUtf8')
205+ ) = arrow_cast(c.expected, 'LargeUtf8')
206+ FROM regexp_replace_optimized_cases AS c
207+ ORDER BY c.regexp, c.value, c.replacement, c.expected;
208+ ----
209+ bzzz true
210+ http://test.org/foo/bar true
211+ https://www.example.com/path/to/page?q=1 true
212+ not-a-url true
213+ http://test.org/foo/bar true
214+ https://example.com/ true
215+ https://www.example.com/path/to/page?q=1 true
216+ not-a-url true
217+
218+ query TB
219+ SELECT c.value, regexp_replace(
220+ arrow_cast(c.value, 'Utf8View'),
221+ arrow_cast(c.regexp, 'Utf8View'),
222+ arrow_cast(c.replacement, 'Utf8View')
223+ ) = arrow_cast(c.expected, 'Utf8View')
224+ FROM regexp_replace_optimized_cases AS c
225+ ORDER BY c.regexp, c.value, c.replacement, c.expected;
226+ ----
227+ bzzz true
228+ http://test.org/foo/bar true
229+ https://www.example.com/path/to/page?q=1 true
230+ not-a-url true
231+ http://test.org/foo/bar true
232+ https://example.com/ true
233+ https://www.example.com/path/to/page?q=1 true
234+ not-a-url true
235+
236+ query TB
237+ SELECT c.value, regexp_replace(
238+ arrow_cast(c.value, 'Dictionary(Int32, Utf8)'),
239+ arrow_cast(c.regexp, 'Dictionary(Int32, Utf8)'),
240+ arrow_cast(c.replacement, 'Dictionary(Int32, Utf8)')
241+ ) = arrow_cast(c.expected, 'Dictionary(Int32, Utf8)')
242+ FROM regexp_replace_optimized_cases AS c
243+ ORDER BY c.regexp, c.value, c.replacement, c.expected;
244+ ----
245+ bzzz true
246+ http://test.org/foo/bar true
247+ https://www.example.com/path/to/page?q=1 true
248+ not-a-url true
249+ http://test.org/foo/bar true
250+ https://example.com/ true
251+ https://www.example.com/path/to/page?q=1 true
252+ not-a-url true
253+
254+ # Cleanup: drop the regexp_replace_optimized_cases fixture table created above.
255+ statement ok
256+ DROP TABLE regexp_replace_optimized_cases;
0 commit comments