Skip to content

Commit fc8f23d

Browse files
committed
[Feature](func) Support REGEXP_EXTRACT_ALL_ARRAY
1 parent 2b1803a commit fc8f23d

File tree

7 files changed

+429
-138
lines changed

7 files changed

+429
-138
lines changed

be/src/exprs/function/function_regexp.cpp

Lines changed: 216 additions & 138 deletions
Large diffs are not rendered by default.

be/test/exprs/function/function_like_test.cpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
#include "core/column/column_string.h"
2222
#include "core/column/column_vector.h"
23+
#include "core/data_type/data_type_array.h"
2324
#include "core/data_type/data_type_nullable.h"
2425
#include "core/data_type/data_type_number.h"
2526
#include "core/data_type/data_type_string.h"
@@ -248,6 +249,63 @@ TEST(FunctionLikeTest, regexp_extract_all) {
248249
}
249250
}
250251

252+
// Exercises regexp_extract_all_array with a non-constant input string and a
// constant pattern column, and checks that the result is a non-null array
// holding the first capturing group of each match.
TEST(FunctionLikeTest, regexp_extract_all_array) {
    std::string func_name = "regexp_extract_all_array";

    // Build return type: Nullable(Array(Nullable(String)))
    auto return_type = make_nullable(
            std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>())));
    auto str_type = std::make_shared<DataTypeString>();

    // Test case: 'x=a3&x=18abc&x=2&y=3&x=4&x=17bcd' with pattern 'x=([0-9]+)([a-z]+)'
    // Expected: ["18","17"] (first capturing group from each match)
    {
        // Take the byte lengths from the strings themselves. A hand-counted length
        // that overshoots makes insert_data read past the end of the literal.
        const std::string input = "x=a3&x=18abc&x=2&y=3&x=4&x=17bcd";
        const std::string pattern = "x=([0-9]+)([a-z]+)";

        auto col_str = ColumnString::create();
        col_str->insert_data(input.data(), input.size());
        auto col_pattern = ColumnString::create();
        col_pattern->insert_data(pattern.data(), pattern.size());
        auto const_pattern = ColumnConst::create(std::move(col_pattern), 1);

        // Block layout: position 0 = input string, 1 = constant pattern, 2 = result slot.
        Block block;
        block.insert({std::move(col_str), str_type, "str"});
        block.insert({std::move(const_pattern), str_type, "pattern"});
        block.insert({nullptr, return_type, "result"});

        ColumnNumbers arguments = {0, 1};
        std::vector<DataTypePtr> arg_types = {str_type, str_type};

        // One entry per argument: nullptr for the non-constant input string,
        // a wrapper around the constant pattern column for the second argument.
        std::vector<std::shared_ptr<ColumnPtrWrapper>> constant_cols;
        constant_cols.push_back(nullptr);
        constant_cols.push_back(
                std::make_shared<ColumnPtrWrapper>(block.get_by_position(1).column));

        FunctionUtils fn_utils({}, arg_types, false);
        auto* fn_ctx = fn_utils.get_fn_ctx();
        fn_ctx->set_constant_cols(constant_cols);

        auto func = SimpleFunctionFactory::instance().get_function(
                func_name, block.get_columns_with_type_and_name(), return_type);
        ASSERT_TRUE(func != nullptr);
        // Open at both scopes before executing; closed in reverse order below.
        ASSERT_EQ(Status::OK(), func->open(fn_ctx, FunctionContext::FRAGMENT_LOCAL));
        ASSERT_EQ(Status::OK(), func->open(fn_ctx, FunctionContext::THREAD_LOCAL));

        // Result goes to block position 2; the trailing 1 is presumably the row count.
        auto st = func->execute(fn_ctx, block, arguments, 2, 1);
        ASSERT_EQ(Status::OK(), st);

        auto result_col = block.get_by_position(2).column;
        ASSERT_TRUE(result_col.get() != nullptr);
        // Result should not be null
        ASSERT_FALSE(result_col->is_null_at(0));
        // Check the string representation of the result; several renderings are
        // accepted because the exact to_string format of the array type is not pinned here.
        auto result_str = return_type->to_string(*result_col, 0);
        EXPECT_TRUE(result_str == "[\"18\", \"17\"]" || result_str == "[\"18\",\"17\"]" ||
                    result_str == "[18, 17]")
                << "Unexpected result: " << result_str;

        static_cast<void>(func->close(fn_ctx, FunctionContext::THREAD_LOCAL));
        static_cast<void>(func->close(fn_ctx, FunctionContext::FRAGMENT_LOCAL));
    }
}
308+
251309
TEST(FunctionLikeTest, regexp_replace) {
252310
std::string func_name = "regexp_replace";
253311

fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,7 @@
416416
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpCount;
417417
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract;
418418
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractAll;
419+
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractAllArray;
419420
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractOrNull;
420421
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplace;
421422
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplaceOne;
@@ -986,6 +987,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
986987
scalar(RegexpCount.class, "regexp_count"),
987988
scalar(RegexpExtract.class, "regexp_extract"),
988989
scalar(RegexpExtractAll.class, "regexp_extract_all"),
990+
scalar(RegexpExtractAllArray.class, "regexp_extract_all_array"),
989991
scalar(RegexpExtractOrNull.class, "regexp_extract_or_null"),
990992
scalar(RegexpReplace.class, "regexp_replace"),
991993
scalar(RegexpReplaceOne.class, "regexp_replace_one"),
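fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/RegexpExtractAllArray.java

Lines changed: 80 additions & 0 deletions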
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
package org.apache.doris.nereids.trees.expressions.functions.scalar;
19+
20+
import org.apache.doris.catalog.FunctionSignature;
21+
import org.apache.doris.nereids.trees.expressions.Expression;
22+
import org.apache.doris.nereids.trees.expressions.functions.AlwaysNullable;
23+
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
24+
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullLiteral;
25+
import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
26+
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
27+
import org.apache.doris.nereids.types.ArrayType;
28+
import org.apache.doris.nereids.types.StringType;
29+
import org.apache.doris.nereids.types.VarcharType;
30+
31+
import com.google.common.base.Preconditions;
32+
import com.google.common.collect.ImmutableList;
33+
34+
import java.util.List;
35+
36+
/**
37+
* ScalarFunction 'regexp_extract_all_array'.
38+
* Returns all matches of a regex pattern as an Array&lt;String&gt; instead of a string-formatted array.
39+
*/
40+
public class RegexpExtractAllArray extends ScalarFunction
41+
implements BinaryExpression, ExplicitlyCastableSignature, AlwaysNullable, PropagateNullLiteral {
42+
43+
public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
44+
FunctionSignature.ret(ArrayType.of(VarcharType.SYSTEM_DEFAULT))
45+
.args(VarcharType.SYSTEM_DEFAULT, VarcharType.SYSTEM_DEFAULT),
46+
FunctionSignature.ret(ArrayType.of(StringType.INSTANCE))
47+
.args(StringType.INSTANCE, StringType.INSTANCE)
48+
);
49+
50+
/**
51+
* constructor with 2 arguments.
52+
*/
53+
public RegexpExtractAllArray(Expression arg0, Expression arg1) {
54+
super("regexp_extract_all_array", arg0, arg1);
55+
}
56+
57+
/** constructor for withChildren and reuse signature */
58+
private RegexpExtractAllArray(ScalarFunctionParams functionParams) {
59+
super(functionParams);
60+
}
61+
62+
/**
63+
* withChildren.
64+
*/
65+
@Override
66+
public RegexpExtractAllArray withChildren(List<Expression> children) {
67+
Preconditions.checkArgument(children.size() == 2);
68+
return new RegexpExtractAllArray(getFunctionParams(children));
69+
}
70+
71+
@Override
72+
public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
73+
return visitor.visitRegexpExtractAllArray(this, context);
74+
}
75+
76+
@Override
77+
public List<FunctionSignature> getSignatures() {
78+
return SIGNATURES;
79+
}
80+
}

fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,7 @@
437437
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpCount;
438438
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract;
439439
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractAll;
440+
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractAllArray;
440441
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractOrNull;
441442
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplace;
442443
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplaceOne;
@@ -2160,6 +2161,10 @@ default R visitRegexpExtractAll(RegexpExtractAll regexpExtractAll, C context) {
21602161
return visitScalarFunction(regexpExtractAll, context);
21612162
}
21622163

2164+
default R visitRegexpExtractAllArray(RegexpExtractAllArray regexpExtractAllArray, C context) {
2165+
return visitScalarFunction(regexpExtractAllArray, context);
2166+
}
2167+
21632168
default R visitRegexpExtractOrNull(RegexpExtractOrNull regexpExtractOrNull, C context) {
21642169
return visitScalarFunction(regexpExtractOrNull, context);
21652170
}

regression-test/data/query_p0/sql_functions/string_functions/test_string_function_regexp.out

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,62 @@ aXb
262262
-- !sql_regexp_extract_all_10 --
263263
['aXb','cXd']
264264

265+
-- !regexp_extract_all_array_1 --
266+
["18", "17"]
267+
268+
-- !regexp_extract_all_array_2 --
269+
["41915", "73660"]
270+
271+
-- !regexp_extract_all_array_3 --
272+
["abc", "def", "ghi"]
273+
274+
-- !regexp_extract_all_array_4 --
275+
[]
276+
277+
-- !regexp_extract_all_array_5 --
278+
\N
279+
280+
-- !regexp_extract_all_array_6 --
281+
\N
282+
283+
-- !regexp_extract_all_array_7 --
284+
["ab", "c", "c", "c"]
285+
286+
-- !regexp_extract_all_array_8 --
287+
\N
288+
[]
289+
["Emmy", "eillish"]
290+
["It", "s", "ok"]
291+
["It", "s", "true"]
292+
["billie", "eillish"]
293+
["billie", "eillish"]
294+
295+
-- !regexp_extract_all_array_9 --
296+
\N
297+
[]
298+
["mmy", "eillish"]
299+
["t", "s", "ok"]
300+
["t", "s", "true"]
301+
["billie", "eillish"]
302+
["billie", "eillish"]
303+
304+
-- !regexp_extract_all_array_10 --
305+
\N 5 \N
306+
6 []
307+
Emmy eillish 3 ["Emmy", "eillish"]
308+
It's ok 2 ["It", "s", "ok"]
309+
It's true 4 ["It", "s", "true"]
310+
billie eillish \N ["billie", "eillish"]
311+
billie eillish 1 ["billie", "eillish"]
312+
313+
-- !regexp_extract_all_array_11 --
314+
[]
315+
[]
316+
[]
317+
[]
318+
[]
319+
[]
320+
265321
-- !sql --
266322
a-b-c
267323

regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_regexp.groovy

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,18 @@ suite("test_string_function_regexp") {
128128
qt_sql_regexp_extract_all_9 "SELECT REGEXP_EXTRACT_ALL(concat('aXb', char(10), 'cXd'), '(?-s)(\\\\w.\\\\w)');"
129129
qt_sql_regexp_extract_all_10 "SELECT REGEXP_EXTRACT_ALL(concat('aXb', char(10), 'cXd'), '(\\\\w.\\\\w)');"
130130

131+
qt_regexp_extract_all_array_1 "SELECT regexp_extract_all_array('x=a3&x=18abc&x=2&y=3&x=4&x=17bcd', 'x=([0-9]+)([a-z]+)');"
132+
qt_regexp_extract_all_array_2 "SELECT regexp_extract_all_array('http://a.m.baidu.com/i41915i73660.htm', 'i([0-9]+)');"
133+
qt_regexp_extract_all_array_3 "SELECT regexp_extract_all_array('abc=111, def=222, ghi=333', '(\"[^\"]+\"|\\\\w+)=(\"[^\"]+\"|\\\\w+)');"
134+
qt_regexp_extract_all_array_4 "select regexp_extract_all_array('xxfs','f');"
135+
qt_regexp_extract_all_array_5 "select regexp_extract_all_array(NULL, 'pattern');"
136+
qt_regexp_extract_all_array_6 "select regexp_extract_all_array('text', NULL);"
137+
qt_regexp_extract_all_array_7 "select regexp_extract_all_array('abcdfesscca', '(ab|c|)');"
138+
qt_regexp_extract_all_array_8 "SELECT regexp_extract_all_array(k, '(\\\\w+)') from test_string_function_regexp ORDER BY k;"
139+
qt_regexp_extract_all_array_9 "SELECT regexp_extract_all_array(k, '([a-z]+)') from test_string_function_regexp ORDER BY k;"
140+
qt_regexp_extract_all_array_10 "SELECT k, v, regexp_extract_all_array(k, '(\\\\w+)') from test_string_function_regexp ORDER BY k;"
141+
qt_regexp_extract_all_array_11 "SELECT regexp_extract_all_array(k, concat('^', k)) from test_string_function_regexp WHERE k IS NOT NULL ORDER BY k;"
142+
131143
qt_sql "SELECT regexp_replace('a b c', \" \", \"-\");"
132144
qt_sql "SELECT regexp_replace('a b c','(b)','<\\\\1>');"
133145

0 commit comments

Comments
 (0)