Skip to content

Commit f625a52

Browse files
committed
rex - initial implementation
Signed-off-by: Jialiang Liang <jiallian@amazon.com>
1 parent b220be4 commit f625a52

12 files changed

Lines changed: 223 additions & 0 deletions

File tree

core/src/main/java/org/opensearch/sql/analysis/Analyzer.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@
8080
import org.opensearch.sql.ast.tree.RareTopN;
8181
import org.opensearch.sql.ast.tree.Relation;
8282
import org.opensearch.sql.ast.tree.RelationSubquery;
83+
import org.opensearch.sql.ast.tree.Rex;
8384
import org.opensearch.sql.ast.tree.Rename;
8485
import org.opensearch.sql.ast.tree.Reverse;
8586
import org.opensearch.sql.ast.tree.Sort;
@@ -736,6 +737,12 @@ public LogicalPlan visitReverse(Reverse node, AnalysisContext context) {
736737
"REVERSE is supported only when " + CALCITE_ENGINE_ENABLED.getKeyValue() + "=true");
737738
}
738739

740+
@Override
741+
public LogicalPlan visitRex(Rex node, AnalysisContext context) {
742+
throw new UnsupportedOperationException(
743+
"REX is supported only when " + CALCITE_ENGINE_ENABLED.getKeyValue() + "=true");
744+
}
745+
739746
@Override
740747
public LogicalPlan visitPaginate(Paginate paginate, AnalysisContext context) {
741748
LogicalPlan child = paginate.getChild().get(0).accept(this, context);

core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
import org.opensearch.sql.ast.tree.Project;
6868
import org.opensearch.sql.ast.tree.RareTopN;
6969
import org.opensearch.sql.ast.tree.Relation;
70+
import org.opensearch.sql.ast.tree.Rex;
7071
import org.opensearch.sql.ast.tree.RelationSubquery;
7172
import org.opensearch.sql.ast.tree.Rename;
7273
import org.opensearch.sql.ast.tree.Reverse;
@@ -249,6 +250,9 @@ public T visitReverse(Reverse node, C context) {
249250
return visitChildren(node, context);
250251
}
251252

253+
public T visitRex(Rex node, C context) {
254+
return visitChildren(node, context);
255+
}
252256
/**
 * Visits a {@code LambdaFunction} node. This default implementation delegates to
 * {@code visitChildren(node, context)} and returns its result.
 */
public T visitLambdaFunction(LambdaFunction node, C context) {
253257
return visitChildren(node, context);
254258
}
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.ast.tree;
7+
8+
import com.google.common.collect.ImmutableList;
9+
import java.util.List;
10+
import java.util.Optional;
11+
import lombok.EqualsAndHashCode;
12+
import lombok.Getter;
13+
import lombok.Setter;
14+
import lombok.ToString;
15+
import org.opensearch.sql.ast.AbstractNodeVisitor;
16+
import org.opensearch.sql.ast.expression.Literal;
17+
import org.opensearch.sql.ast.expression.UnresolvedExpression;
18+
19+
/**
 * AST node representing the rex field-extraction operation.
 *
 * <p>Holds the expression to extract from, the regex pattern literal, an optional maximum match
 * count, and the child plan this node is attached to.
 */
@Getter
@ToString
@EqualsAndHashCode(callSuper = false)
public class Rex extends UnresolvedPlan {
  /** Field to extract from. */
  private final UnresolvedExpression field;

  /** Pattern with named capture groups. */
  private final Literal pattern;

  /** Maximum number of matches; empty when not specified. Never {@code null}. */
  private final Optional<Integer> maxMatch;

  /** Child plan; {@code null} until {@link #attach(UnresolvedPlan)} or the setter is called. */
  @Setter private UnresolvedPlan child;

  /**
   * Creates a rex node without a maximum match count.
   *
   * @param field expression to extract from
   * @param pattern regex pattern literal
   */
  public Rex(UnresolvedExpression field, Literal pattern) {
    this(field, pattern, Optional.empty());
  }

  /**
   * Creates a rex node.
   *
   * @param field expression to extract from
   * @param pattern regex pattern literal
   * @param maxMatch maximum number of matches; a {@code null} argument is treated as empty so
   *     that {@link #getMaxMatch()} never hands out a null {@code Optional}
   */
  public Rex(UnresolvedExpression field, Literal pattern, Optional<Integer> maxMatch) {
    this.field = field;
    this.pattern = pattern;
    this.maxMatch = maxMatch == null ? Optional.empty() : maxMatch;
  }

  /**
   * Attaches the child plan to this node.
   *
   * @param child the plan this node operates on
   * @return this node
   */
  @Override
  public Rex attach(UnresolvedPlan child) {
    this.child = child;
    return this;
  }

  /**
   * Returns the child plan as a list.
   *
   * @return a single-element list holding the child, or an empty list while no child has been
   *     attached ({@code ImmutableList.of(null)} would throw a NullPointerException)
   */
  @Override
  public List<UnresolvedPlan> getChild() {
    return child == null ? ImmutableList.of() : ImmutableList.of(child);
  }

  /** Dispatches to {@link AbstractNodeVisitor#visitRex(Rex, Object)}. */
  @Override
  public <T, C> T accept(AbstractNodeVisitor<T, C> nodeVisitor, C context) {
    return nodeVisitor.visitRex(this, context);
  }
}

core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@
100100
import org.opensearch.sql.ast.tree.RareTopN;
101101
import org.opensearch.sql.ast.tree.Relation;
102102
import org.opensearch.sql.ast.tree.Rename;
103+
import org.opensearch.sql.ast.tree.Rex;
103104
import org.opensearch.sql.ast.tree.Sort;
104105
import org.opensearch.sql.ast.tree.Sort.SortOption;
105106
import org.opensearch.sql.ast.tree.SubqueryAlias;
@@ -117,6 +118,7 @@
117118
import org.opensearch.sql.common.utils.StringUtils;
118119
import org.opensearch.sql.exception.CalciteUnsupportedException;
119120
import org.opensearch.sql.exception.SemanticCheckException;
121+
import org.opensearch.sql.expression.parse.RegexCommonUtils;
120122
import org.opensearch.sql.expression.function.BuiltinFunctionName;
121123
import org.opensearch.sql.expression.function.PPLFuncImpTable;
122124
import org.opensearch.sql.utils.ParseUtils;
@@ -171,6 +173,41 @@ public RelNode visitFilter(Filter node, CalcitePlanContext context) {
171173
return context.relBuilder.peek();
172174
}
173175

176+
@Override
177+
public RelNode visitRex(Rex node, CalcitePlanContext context) {
178+
visitChildren(node, context);
179+
180+
RexNode fieldRex = rexVisitor.analyze(node.getField(), context);
181+
String patternStr = (String) node.getPattern().getValue();
182+
List<String> namedGroups = RegexCommonUtils.getNamedGroupCandidates(patternStr);
183+
184+
if (namedGroups.isEmpty()) {
185+
throw new IllegalArgumentException("Rex pattern must contain at least one named capture group");
186+
}
187+
188+
// Filter matching rows on data nodes using script pushdown
189+
RexNode regexMatchCondition = context.rexBuilder.makeCall(
190+
org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_CONTAINS,
191+
fieldRex,
192+
context.rexBuilder.makeLiteral(patternStr)
193+
);
194+
context.relBuilder.filter(regexMatchCondition);
195+
196+
// Extract fields from filtered data
197+
List<RexNode> newFields = new ArrayList<>();
198+
for (int i = 0; i < namedGroups.size(); i++) {
199+
RexNode extractCall = PPLFuncImpTable.INSTANCE.resolve(
200+
context.rexBuilder,
201+
BuiltinFunctionName.REX_EXTRACT,
202+
fieldRex,
203+
context.rexBuilder.makeLiteral(patternStr),
204+
context.relBuilder.literal(i + 1));
205+
newFields.add(extractCall);
206+
}
207+
208+
projectPlusOverriding(newFields, namedGroups, context);
209+
return context.relBuilder.peek();
210+
}
174211
private boolean containsSubqueryExpression(Node expr) {
175212
if (expr == null) {
176213
return false;

core/src/main/java/org/opensearch/sql/calcite/utils/PPLOperandTypes.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ private PPLOperandTypes() {}
4747
UDFOperandMetadata.wrap((FamilyOperandTypeChecker) OperandTypes.NUMERIC_NUMERIC);
4848
public static final UDFOperandMetadata STRING_INTEGER =
4949
UDFOperandMetadata.wrap(OperandTypes.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.INTEGER));
50+
public static final UDFOperandMetadata STRING_STRING_INTEGER =
51+
UDFOperandMetadata.wrap(OperandTypes.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER, SqlTypeFamily.INTEGER));
5052

5153
public static final UDFOperandMetadata NUMERIC_NUMERIC_NUMERIC =
5254
UDFOperandMetadata.wrap(

core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ public enum BuiltinFunctionName {
215215
LTRIM(FunctionName.of("ltrim")),
216216
POSITION(FunctionName.of("position")),
217217
REGEXP(FunctionName.of("regexp")),
218+
REX_EXTRACT(FunctionName.of("REX_EXTRACT")),
218219
REPLACE(FunctionName.of("replace")),
219220
REVERSE(FunctionName.of("reverse")),
220221
RIGHT(FunctionName.of("right")),

core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
import org.opensearch.sql.expression.function.udf.CryptographicFunction;
4949
import org.opensearch.sql.expression.function.udf.GrokFunction;
5050
import org.opensearch.sql.expression.function.udf.RelevanceQueryFunction;
51+
import org.opensearch.sql.expression.function.udf.RexExtractFunction;
5152
import org.opensearch.sql.expression.function.udf.SpanFunction;
5253
import org.opensearch.sql.expression.function.udf.condition.EarliestFunction;
5354
import org.opensearch.sql.expression.function.udf.condition.LatestFunction;
@@ -380,6 +381,7 @@ public class PPLBuiltinOperators extends ReflectiveSqlOperatorTable {
380381
RELEVANCE_QUERY_FUNCTION_INSTANCE.toUDF("multi_match", false);
381382
public static final SqlOperator NUMBER_TO_STRING =
382383
new NumberToStringFunction().toUDF("NUMBER_TO_STRING");
384+
public static final SqlOperator REX_EXTRACT = new RexExtractFunction().toUDF("REX_EXTRACT");
383385

384386
/**
385387
* Returns the PPL specific operator table, creating it if necessary.

core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@
160160
import static org.opensearch.sql.expression.function.BuiltinFunctionName.RAND;
161161
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REDUCE;
162162
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REGEXP;
163+
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_EXTRACT;
163164
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REPLACE;
164165
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REVERSE;
165166
import static org.opensearch.sql.expression.function.BuiltinFunctionName.RIGHT;
@@ -711,6 +712,7 @@ void populate() {
711712
registerOperator(SIMPLE_QUERY_STRING, PPLBuiltinOperators.SIMPLE_QUERY_STRING);
712713
registerOperator(QUERY_STRING, PPLBuiltinOperators.QUERY_STRING);
713714
registerOperator(MULTI_MATCH, PPLBuiltinOperators.MULTI_MATCH);
715+
registerOperator(REX_EXTRACT, PPLBuiltinOperators.REX_EXTRACT);
714716

715717
// Register PPL Datetime UDF operator
716718
registerOperator(TIMESTAMP, PPLBuiltinOperators.TIMESTAMP);
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.expression.function.udf;
7+
8+
import org.apache.calcite.adapter.enumerable.NotNullImplementor;
9+
import org.apache.calcite.adapter.enumerable.NullPolicy;
10+
import org.apache.calcite.adapter.enumerable.RexToLixTranslator;
11+
import org.apache.calcite.linq4j.tree.Expression;
12+
import org.apache.calcite.linq4j.tree.Expressions;
13+
import org.apache.calcite.rex.RexCall;
14+
import org.apache.calcite.sql.type.ReturnTypes;
15+
import org.apache.calcite.sql.type.SqlReturnTypeInference;
16+
import org.opensearch.sql.calcite.utils.PPLOperandTypes;
17+
import org.opensearch.sql.expression.function.ImplementorUDF;
18+
import org.opensearch.sql.expression.function.UDFOperandMetadata;
19+
20+
import java.util.List;
21+
22+
/**
23+
* Custom REX_EXTRACT function for extracting regex named capture groups.
24+
*/
25+
public final class RexExtractFunction extends ImplementorUDF {
26+
27+
public RexExtractFunction() {
28+
super(new RexExtractImplementor(), NullPolicy.ARG0);
29+
}
30+
31+
@Override
32+
public SqlReturnTypeInference getReturnTypeInference() {
33+
return ReturnTypes.VARCHAR_2000_NULLABLE;
34+
}
35+
36+
@Override
37+
public UDFOperandMetadata getOperandMetadata() {
38+
return PPLOperandTypes.STRING_STRING_INTEGER;
39+
}
40+
41+
private static class RexExtractImplementor implements NotNullImplementor {
42+
43+
@Override
44+
public Expression implement(
45+
RexToLixTranslator translator, RexCall call, List<Expression> translatedOperands) {
46+
Expression field = translatedOperands.get(0);
47+
Expression pattern = translatedOperands.get(1);
48+
Expression groupIndex = translatedOperands.get(2);
49+
50+
return Expressions.call(
51+
RexExtractFunction.class,
52+
"extractGroup",
53+
field,
54+
pattern,
55+
groupIndex
56+
);
57+
}
58+
}
59+
public static String extractGroup(String text, String pattern, int groupIndex) {
60+
if (text == null || pattern == null) {
61+
return null;
62+
}
63+
64+
try {
65+
java.util.regex.Pattern compiledPattern = java.util.regex.Pattern.compile(pattern);
66+
java.util.regex.Matcher matcher = compiledPattern.matcher(text);
67+
68+
if (matcher.find() && groupIndex > 0 && groupIndex <= matcher.groupCount()) {
69+
return matcher.group(groupIndex);
70+
}
71+
72+
return null;
73+
} catch (Exception e) {
74+
return null;
75+
}
76+
}
77+
}

ppl/src/main/antlr/OpenSearchPPLLexer.g4

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ EXPLAIN: 'EXPLAIN';
1717
FROM: 'FROM';
1818
WHERE: 'WHERE';
1919
FIELDS: 'FIELDS';
20+
FIELD: 'FIELD';
2021
TABLE: 'TABLE'; // Alias for FIELDS command
2122
RENAME: 'RENAME';
2223
STATS: 'STATS';
@@ -29,6 +30,7 @@ TOP: 'TOP';
2930
RARE: 'RARE';
3031
PARSE: 'PARSE';
3132
REGEX: 'REGEX';
33+
REX: 'REX';
3234
PUNCT: 'PUNCT';
3335
GROK: 'GROK';
3436
PATTERN: 'PATTERN';
@@ -48,6 +50,7 @@ VARIABLE_COUNT_THRESHOLD: 'VARIABLE_COUNT_THRESHOLD';
4850
FREQUENCY_THRESHOLD_PERCENTAGE: 'FREQUENCY_THRESHOLD_PERCENTAGE';
4951
METHOD: 'METHOD';
5052
MAX_SAMPLE_COUNT: 'MAX_SAMPLE_COUNT';
53+
MAX_MATCH: 'MAX_MATCH';
5154
BUFFER_LIMIT: 'BUFFER_LIMIT';
5255
LABEL: 'LABEL';
5356
AGGREGATION: 'AGGREGATION';

0 commit comments

Comments
 (0)