Skip to content

Commit 050855d

Browse files
committed
[Feature] Implementation of mode sed and offset_field in rex PPL command
Signed-off-by: Jialiang Liang <jiallian@amazon.com>
1 parent c186686 commit 050855d

13 files changed

Lines changed: 434 additions & 6 deletions

File tree

core/src/main/java/org/opensearch/sql/ast/tree/Rex.java

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,38 +23,52 @@
2323
public class Rex extends UnresolvedPlan {
2424

2525
public enum RexMode {
26-
EXTRACT
26+
EXTRACT,
27+
SED
2728
}
2829

2930
/** Field to extract from. */
3031
private final UnresolvedExpression field;
3132

32-
/** Pattern with named capture groups. */
33+
/** Pattern with named capture groups or sed expression. */
3334
private final Literal pattern;
3435

35-
/** Rex mode (only EXTRACT supported). */
36+
/** Rex mode (extract or sed). */
3637
private final RexMode mode;
3738

3839
/** Maximum number of matches (optional). */
3940
private final Optional<Integer> maxMatch;
4041

42+
/** Offset field name for position tracking (optional). */
43+
private final Optional<String> offsetField;
44+
4145
/** Child Plan. */
4246
@Setter private UnresolvedPlan child;
4347

4448
public Rex(UnresolvedExpression field, Literal pattern) {
45-
this(field, pattern, RexMode.EXTRACT, Optional.empty());
49+
this(field, pattern, RexMode.EXTRACT, Optional.empty(), Optional.empty());
4650
}
4751

4852
public Rex(UnresolvedExpression field, Literal pattern, Optional<Integer> maxMatch) {
49-
this(field, pattern, RexMode.EXTRACT, maxMatch);
53+
this(field, pattern, RexMode.EXTRACT, maxMatch, Optional.empty());
5054
}
5155

5256
public Rex(
5357
UnresolvedExpression field, Literal pattern, RexMode mode, Optional<Integer> maxMatch) {
58+
this(field, pattern, mode, maxMatch, Optional.empty());
59+
}
60+
61+
public Rex(
62+
UnresolvedExpression field,
63+
Literal pattern,
64+
RexMode mode,
65+
Optional<Integer> maxMatch,
66+
Optional<String> offsetField) {
5467
this.field = field;
5568
this.pattern = pattern;
5669
this.mode = mode;
5770
this.maxMatch = maxMatch;
71+
this.offsetField = offsetField;
5872
}
5973

6074
@Override

core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,13 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
216216
RexNode fieldRex = rexVisitor.analyze(node.getField(), context);
217217
String patternStr = (String) node.getPattern().getValue();
218218

219+
if (node.getMode() == Rex.RexMode.SED) {
220+
RexNode sedCall = createOptimizedSedCall(fieldRex, patternStr, context);
221+
String fieldName = node.getField().toString();
222+
projectPlusOverriding(List.of(sedCall), List.of(fieldName), context);
223+
return context.relBuilder.peek();
224+
}
225+
219226
List<String> namedGroups = RegexCommonUtils.getNamedGroupCandidates(patternStr);
220227

221228
if (namedGroups.isEmpty()) {
@@ -250,6 +257,17 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
250257
newFieldNames.add(namedGroups.get(i));
251258
}
252259

260+
if (node.getOffsetField().isPresent()) {
261+
RexNode offsetCall =
262+
PPLFuncImpTable.INSTANCE.resolve(
263+
context.rexBuilder,
264+
BuiltinFunctionName.REX_OFFSET,
265+
fieldRex,
266+
context.rexBuilder.makeLiteral(patternStr));
267+
newFields.add(offsetCall);
268+
newFieldNames.add(node.getOffsetField().get());
269+
}
270+
253271
projectPlusOverriding(newFields, newFieldNames, context);
254272
return context.relBuilder.peek();
255273
}
@@ -2108,4 +2126,118 @@ private void buildExpandRelNode(
21082126
context.relBuilder.rename(names);
21092127
}
21102128
}
2129+
2130+
/**
2131+
* Creates an optimized sed call using native Calcite functions where possible. Falls back to
2132+
* custom REX_SED for complex cases.
2133+
*/
2134+
private RexNode createOptimizedSedCall(
2135+
RexNode fieldRex, String sedExpression, CalcitePlanContext context) {
2136+
if (sedExpression.startsWith("s/")) {
2137+
return createOptimizedSubstitution(fieldRex, sedExpression, context);
2138+
} else if (sedExpression.startsWith("y/")) {
2139+
return createOptimizedTransliteration(fieldRex, sedExpression, context);
2140+
} else {
2141+
throw new RuntimeException("Unsupported sed pattern: " + sedExpression);
2142+
}
2143+
}
2144+
2145+
/** Creates optimized substitution calls for s/pattern/replacement/flags syntax. */
2146+
private RexNode createOptimizedSubstitution(
2147+
RexNode fieldRex, String sedExpression, CalcitePlanContext context) {
2148+
try {
2149+
// Parse sed substitution: s/pattern/replacement/flags
2150+
if (!sedExpression.matches("s/.+/.*/.*")) {
2151+
throw new IllegalArgumentException("Invalid sed substitution format");
2152+
}
2153+
2154+
// Find the delimiters - sed format is s/pattern/replacement/flags
2155+
int firstDelimiter = sedExpression.indexOf('/', 2); // First '/' after 's/'
2156+
int secondDelimiter = sedExpression.indexOf('/', firstDelimiter + 1); // Second '/'
2157+
int thirdDelimiter = sedExpression.indexOf('/', secondDelimiter + 1); // Third '/' (optional)
2158+
2159+
if (firstDelimiter == -1 || secondDelimiter == -1) {
2160+
throw new IllegalArgumentException("Invalid sed substitution format");
2161+
}
2162+
2163+
String pattern = sedExpression.substring(2, firstDelimiter);
2164+
String replacement = sedExpression.substring(firstDelimiter + 1, secondDelimiter);
2165+
String flags =
2166+
secondDelimiter + 1 < sedExpression.length()
2167+
? sedExpression.substring(secondDelimiter + 1)
2168+
: "";
2169+
2170+
// Convert sed backreferences (\1, \2) to Java style ($1, $2)
2171+
String javaReplacement = replacement.replaceAll("\\\\(\\d+)", "\\$$1");
2172+
2173+
if (flags.isEmpty()) {
2174+
// 3-parameter REGEXP_REPLACE
2175+
return PPLFuncImpTable.INSTANCE.resolve(
2176+
context.rexBuilder,
2177+
BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_3,
2178+
fieldRex,
2179+
context.rexBuilder.makeLiteral(pattern),
2180+
context.rexBuilder.makeLiteral(javaReplacement));
2181+
} else if (flags.matches("[gi]+")) {
2182+
// 4-parameter REGEXP_REPLACE with flags
2183+
return PPLFuncImpTable.INSTANCE.resolve(
2184+
context.rexBuilder,
2185+
BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_PG_4,
2186+
fieldRex,
2187+
context.rexBuilder.makeLiteral(pattern),
2188+
context.rexBuilder.makeLiteral(javaReplacement),
2189+
context.rexBuilder.makeLiteral(flags));
2190+
} else if (flags.matches("\\d+")) {
2191+
// 5-parameter REGEXP_REPLACE with occurrence
2192+
int occurrence = Integer.parseInt(flags);
2193+
return PPLFuncImpTable.INSTANCE.resolve(
2194+
context.rexBuilder,
2195+
BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_5,
2196+
fieldRex,
2197+
context.rexBuilder.makeLiteral(pattern),
2198+
context.rexBuilder.makeLiteral(javaReplacement),
2199+
context.relBuilder.literal(1), // start position
2200+
context.relBuilder.literal(occurrence));
2201+
} else {
2202+
throw new RuntimeException(
2203+
"Unsupported sed flags: " + flags + " in expression: " + sedExpression);
2204+
}
2205+
} catch (Exception e) {
2206+
throw new RuntimeException("Failed to optimize sed expression: " + sedExpression, e);
2207+
}
2208+
}
2209+
2210+
/** Creates optimized transliteration calls for y/from/to/ syntax. */
2211+
private RexNode createOptimizedTransliteration(
2212+
RexNode fieldRex, String sedExpression, CalcitePlanContext context) {
2213+
try {
2214+
// Parse sed transliteration: y/from/to/
2215+
if (!sedExpression.matches("y/.+/.*/.*")) {
2216+
throw new IllegalArgumentException("Invalid sed transliteration format");
2217+
}
2218+
2219+
int firstSlash = sedExpression.indexOf('/', 1);
2220+
int secondSlash = sedExpression.indexOf('/', firstSlash + 1);
2221+
int thirdSlash = sedExpression.indexOf('/', secondSlash + 1);
2222+
2223+
if (firstSlash == -1 || secondSlash == -1) {
2224+
throw new IllegalArgumentException("Invalid sed transliteration format");
2225+
}
2226+
2227+
String from = sedExpression.substring(firstSlash + 1, secondSlash);
2228+
String to =
2229+
sedExpression.substring(
2230+
secondSlash + 1, thirdSlash != -1 ? thirdSlash : sedExpression.length());
2231+
2232+
// Use Calcite's native TRANSLATE3 function
2233+
return PPLFuncImpTable.INSTANCE.resolve(
2234+
context.rexBuilder,
2235+
BuiltinFunctionName.INTERNAL_TRANSLATE3,
2236+
fieldRex,
2237+
context.rexBuilder.makeLiteral(from),
2238+
context.rexBuilder.makeLiteral(to));
2239+
} catch (Exception e) {
2240+
throw new RuntimeException("Failed to optimize sed expression: " + sedExpression, e);
2241+
}
2242+
}
21112243
}

core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ public enum BuiltinFunctionName {
221221
REGEX_MATCH(FunctionName.of("regex_match")),
222222
REX_EXTRACT(FunctionName.of("REX_EXTRACT")),
223223
REX_EXTRACT_MULTI(FunctionName.of("REX_EXTRACT_MULTI")),
224+
REX_OFFSET(FunctionName.of("REX_OFFSET")),
224225
REPLACE(FunctionName.of("replace")),
225226
REVERSE(FunctionName.of("reverse")),
226227
RIGHT(FunctionName.of("right")),

core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
import org.opensearch.sql.expression.function.udf.RelevanceQueryFunction;
5959
import org.opensearch.sql.expression.function.udf.RexExtractFunction;
6060
import org.opensearch.sql.expression.function.udf.RexExtractMultiFunction;
61+
import org.opensearch.sql.expression.function.udf.RexOffsetFunction;
6162
import org.opensearch.sql.expression.function.udf.SpanFunction;
6263
import org.opensearch.sql.expression.function.udf.condition.EarliestFunction;
6364
import org.opensearch.sql.expression.function.udf.condition.EnhancedCoalesceFunction;
@@ -406,6 +407,7 @@ public class PPLBuiltinOperators extends ReflectiveSqlOperatorTable {
406407
public static final SqlOperator REX_EXTRACT = new RexExtractFunction().toUDF("REX_EXTRACT");
407408
public static final SqlOperator REX_EXTRACT_MULTI =
408409
new RexExtractMultiFunction().toUDF("REX_EXTRACT_MULTI");
410+
public static final SqlOperator REX_OFFSET = new RexOffsetFunction().toUDF("REX_OFFSET");
409411

410412
// Aggregation functions
411413
public static final SqlAggFunction AVG_NULLABLE = new NullableSqlAvgAggFunction(SqlKind.AVG);

core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@
164164
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REVERSE;
165165
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_EXTRACT;
166166
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_EXTRACT_MULTI;
167+
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_OFFSET;
167168
import static org.opensearch.sql.expression.function.BuiltinFunctionName.RIGHT;
168169
import static org.opensearch.sql.expression.function.BuiltinFunctionName.RINT;
169170
import static org.opensearch.sql.expression.function.BuiltinFunctionName.ROUND;
@@ -713,6 +714,7 @@ void populate() {
713714
registerOperator(MULTI_MATCH, PPLBuiltinOperators.MULTI_MATCH);
714715
registerOperator(REX_EXTRACT, PPLBuiltinOperators.REX_EXTRACT);
715716
registerOperator(REX_EXTRACT_MULTI, PPLBuiltinOperators.REX_EXTRACT_MULTI);
717+
registerOperator(REX_OFFSET, PPLBuiltinOperators.REX_OFFSET);
716718

717719
// Register PPL Datetime UDF operator
718720
registerOperator(TIMESTAMP, PPLBuiltinOperators.TIMESTAMP);
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.expression.function.udf;
7+
8+
import java.util.List;
9+
import java.util.regex.Matcher;
10+
import java.util.regex.Pattern;
11+
import org.apache.calcite.adapter.enumerable.NotNullImplementor;
12+
import org.apache.calcite.adapter.enumerable.NullPolicy;
13+
import org.apache.calcite.adapter.enumerable.RexToLixTranslator;
14+
import org.apache.calcite.linq4j.tree.Expression;
15+
import org.apache.calcite.linq4j.tree.Expressions;
16+
import org.apache.calcite.rex.RexCall;
17+
import org.apache.calcite.sql.type.ReturnTypes;
18+
import org.apache.calcite.sql.type.SqlReturnTypeInference;
19+
import org.opensearch.sql.calcite.utils.PPLOperandTypes;
20+
import org.opensearch.sql.expression.function.ImplementorUDF;
21+
import org.opensearch.sql.expression.function.UDFOperandMetadata;
22+
23+
/** Custom REX_OFFSET function for calculating regex match positions. */
24+
public final class RexOffsetFunction extends ImplementorUDF {
25+
26+
/**
 * Creates the UDF backed by {@link RexOffsetImplementor}, registered with
 * {@code NullPolicy.ARG0}.
 */
public RexOffsetFunction() {
  super(new RexOffsetImplementor(), NullPolicy.ARG0);
}
29+
30+
@Override
31+
public SqlReturnTypeInference getReturnTypeInference() {
32+
return ReturnTypes.VARCHAR_2000_NULLABLE;
33+
}
34+
35+
@Override
36+
public UDFOperandMetadata getOperandMetadata() {
37+
return PPLOperandTypes.STRING_STRING;
38+
}
39+
40+
private static class RexOffsetImplementor implements NotNullImplementor {
41+
42+
@Override
43+
public Expression implement(
44+
RexToLixTranslator translator, RexCall call, List<Expression> translatedOperands) {
45+
Expression field = translatedOperands.get(0);
46+
Expression pattern = translatedOperands.get(1);
47+
48+
return Expressions.call(RexOffsetFunction.class, "calculateOffsets", field, pattern);
49+
}
50+
}
51+
52+
public static String calculateOffsets(String text, String patternStr) {
53+
if (text == null || patternStr == null) {
54+
return null;
55+
}
56+
57+
try {
58+
Pattern pattern = Pattern.compile(patternStr);
59+
Matcher matcher = pattern.matcher(text);
60+
61+
if (!matcher.find()) {
62+
return null;
63+
}
64+
65+
List<String> offsetPairs = new java.util.ArrayList<>();
66+
67+
Pattern namedGroupPattern = Pattern.compile("\\(\\?<([^>]+)>");
68+
Matcher namedGroupMatcher = namedGroupPattern.matcher(patternStr);
69+
70+
int groupIndex = 1;
71+
72+
while (namedGroupMatcher.find()) {
73+
String groupName = namedGroupMatcher.group(1);
74+
75+
if (groupIndex <= matcher.groupCount()) {
76+
int start = matcher.start(groupIndex);
77+
int end = matcher.end(groupIndex);
78+
79+
if (start >= 0 && end >= 0) {
80+
offsetPairs.add(groupName + "=" + start + "-" + (end - 1));
81+
}
82+
}
83+
groupIndex++;
84+
}
85+
86+
java.util.Collections.reverse(offsetPairs);
87+
return offsetPairs.isEmpty() ? null : String.join("&", offsetPairs);
88+
} catch (Exception e) {
89+
return null;
90+
}
91+
}
92+
}

0 commit comments

Comments
 (0)