Skip to content

Commit 6cf94c7

Browse files
authored
Add more regexp_replace test coverage (#21485)
## Which issue does this PR close?

- Related to #21379

## Rationale for this change

While reviewing #21379, I noticed there was minimal Utf8View coverage of the related code.

## What changes are included in this PR?

Update the regexp_replace tests to cover Utf8, LargeUtf8, Utf8View, and Dictionary inputs.

## Are these changes tested?

Yes; this PR only changes tests. I verified these tests also pass when run on:

- #21379

## Are there any user-facing changes?

No.
1 parent 02e4411 commit 6cf94c7

File tree

1 file changed

+108
-37
lines changed

1 file changed

+108
-37
lines changed

datafusion/sqllogictest/test_files/regexp/regexp_replace.slt

Lines changed: 108 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -128,43 +128,6 @@ from (values ('a'), ('b')) as tbl(col);
128128
NULL NULL NULL
129129
NULL NULL NULL
130130

131-
# Extract domain from URL using anchored pattern with trailing .*
132-
# This tests that the full URL suffix is replaced, not just the matched prefix
133-
query T
134-
SELECT regexp_replace(url, '^https?://(?:www\.)?([^/]+)/.*$', '\1') FROM (VALUES
135-
('https://www.example.com/path/to/page?q=1'),
136-
('http://test.org/foo/bar'),
137-
('https://example.com/'),
138-
('not-a-url')
139-
) AS t(url);
140-
----
141-
example.com
142-
test.org
143-
example.com
144-
not-a-url
145-
146-
# More than one capture group should disable the short-regex fast path.
147-
# This still uses replacement \1, but captures_len() will be > 2, so the
148-
# implementation must fall back to the normal regexp_replace path.
149-
query T
150-
SELECT regexp_replace(url, '^https?://((www\.)?([^/]+))/.*$', '\1') FROM (VALUES
151-
('https://www.example.com/path/to/page?q=1'),
152-
('http://test.org/foo/bar'),
153-
('not-a-url')
154-
) AS t(url);
155-
----
156-
www.example.com
157-
test.org
158-
not-a-url
159-
160-
# If the overall pattern matches but capture group 1 does not participate,
161-
# regexp_replace(..., '\1') should substitute the empty string, not keep
162-
# the original input.
163-
query B
164-
SELECT regexp_replace('bzzz', '^(a)?b.*$', '\1') = '';
165-
----
166-
true
167-
168131
# Stripping trailing .*$ must not change match semantics for inputs with
169132
# newlines when the original pattern does not use the 's' flag.
170133
query B
@@ -183,3 +146,111 @@ SELECT regexp_replace(
183146
) = concat('x', chr(10), 'rest');
184147
----
185148
true
149+
150+
151+
# Fixture for testing optimizations in regexp_replace
152+
statement ok
153+
CREATE TABLE regexp_replace_optimized_cases (
154+
value string,
155+
regexp string,
156+
replacement string,
157+
expected string
158+
);
159+
160+
# Extract domain from URL using anchored pattern with trailing .*
161+
# This tests that the full URL suffix is replaced, not just the matched prefix.
162+
statement ok
163+
INSERT INTO regexp_replace_optimized_cases VALUES
164+
('https://www.example.com/path/to/page?q=1', '^https?://(?:www\.)?([^/]+)/.*$', '\1', 'example.com'),
165+
('http://test.org/foo/bar', '^https?://(?:www\.)?([^/]+)/.*$', '\1', 'test.org'),
166+
('https://example.com/', '^https?://(?:www\.)?([^/]+)/.*$', '\1', 'example.com'),
167+
('not-a-url', '^https?://(?:www\.)?([^/]+)/.*$', '\1', 'not-a-url');
168+
169+
# More than one capture group should disable the short-regex fast path.
170+
# This still uses replacement \1, but captures_len() will be > 2, so the
171+
# implementation must fall back to the normal regexp_replace path.
172+
statement ok
173+
INSERT INTO regexp_replace_optimized_cases VALUES
174+
('https://www.example.com/path/to/page?q=1', '^https?://((www\.)?([^/]+))/.*$', '\1', 'www.example.com'),
175+
('http://test.org/foo/bar', '^https?://((www\.)?([^/]+))/.*$', '\1', 'test.org'),
176+
('not-a-url', '^https?://((www\.)?([^/]+))/.*$', '\1', 'not-a-url');
177+
178+
# If the overall pattern matches but capture group 1 does not participate,
179+
# regexp_replace(..., '\1') should substitute the empty string, not keep
180+
# the original input.
181+
statement ok
182+
INSERT INTO regexp_replace_optimized_cases VALUES
183+
('bzzz', '^(a)?b.*$', '\1', '');
184+
185+
186+
query TB
187+
SELECT value, regexp_replace(value, regexp, replacement) = expected
188+
FROM regexp_replace_optimized_cases
189+
ORDER BY regexp, value, replacement, expected;
190+
----
191+
bzzz true
192+
http://test.org/foo/bar true
193+
https://www.example.com/path/to/page?q=1 true
194+
not-a-url true
195+
http://test.org/foo/bar true
196+
https://example.com/ true
197+
https://www.example.com/path/to/page?q=1 true
198+
not-a-url true
199+
200+
query TB
201+
SELECT value, regexp_replace(
202+
arrow_cast(value, 'LargeUtf8'),
203+
arrow_cast(regexp, 'LargeUtf8'),
204+
arrow_cast(replacement, 'LargeUtf8')
205+
) = arrow_cast(expected, 'LargeUtf8')
206+
FROM regexp_replace_optimized_cases
207+
ORDER BY regexp, value, replacement, expected;
208+
----
209+
bzzz true
210+
http://test.org/foo/bar true
211+
https://www.example.com/path/to/page?q=1 true
212+
not-a-url true
213+
http://test.org/foo/bar true
214+
https://example.com/ true
215+
https://www.example.com/path/to/page?q=1 true
216+
not-a-url true
217+
218+
query TB
219+
SELECT value, regexp_replace(
220+
arrow_cast(value, 'Utf8View'),
221+
arrow_cast(regexp, 'Utf8View'),
222+
arrow_cast(replacement, 'Utf8View')
223+
) = arrow_cast(expected, 'Utf8View')
224+
FROM regexp_replace_optimized_cases
225+
ORDER BY regexp, value, replacement, expected;
226+
----
227+
bzzz true
228+
http://test.org/foo/bar true
229+
https://www.example.com/path/to/page?q=1 true
230+
not-a-url true
231+
http://test.org/foo/bar true
232+
https://example.com/ true
233+
https://www.example.com/path/to/page?q=1 true
234+
not-a-url true
235+
236+
query TB
237+
SELECT value, regexp_replace(
238+
arrow_cast(value, 'Dictionary(Int32, Utf8)'),
239+
arrow_cast(regexp, 'Dictionary(Int32, Utf8)'),
240+
arrow_cast(replacement, 'Dictionary(Int32, Utf8)')
241+
) = arrow_cast(expected, 'Dictionary(Int32, Utf8)')
242+
FROM regexp_replace_optimized_cases
243+
ORDER BY regexp, value, replacement, expected;
244+
----
245+
bzzz true
246+
http://test.org/foo/bar true
247+
https://www.example.com/path/to/page?q=1 true
248+
not-a-url true
249+
http://test.org/foo/bar true
250+
https://example.com/ true
251+
https://www.example.com/path/to/page?q=1 true
252+
not-a-url true
253+
254+
# cleanup
255+
statement ok
256+
DROP TABLE regexp_replace_optimized_cases;

0 commit comments

Comments
 (0)