Skip to content

Commit 8ec0276

Browse files
committed
sed - initial implementation
Signed-off-by: Jialiang Liang <jiallian@amazon.com>
1 parent 55aa9ab commit 8ec0276

9 files changed

Lines changed: 197 additions & 5 deletions

File tree

core/src/main/java/org/opensearch/sql/ast/tree/Rex.java

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,16 @@
2121
@ToString
2222
@EqualsAndHashCode(callSuper = false)
2323
public class Rex extends UnresolvedPlan {
24+
25+
public enum RexMode {
26+
EXTRACT,
27+
SED
28+
}
29+
2430
/** Field to extract from. */
2531
private final UnresolvedExpression field;
2632

27-
/** Pattern with named capture groups. */
33+
/** Pattern with named capture groups or sed expression. */
2834
private final Literal pattern;
2935

3036
/** Maximum number of matches (optional). */
@@ -33,26 +39,39 @@ public class Rex extends UnresolvedPlan {
3339
/** Offset field name for position tracking (optional). */
3440
private final Optional<String> offsetField;
3541

42+
/** Rex mode (extract or sed). */
43+
private final RexMode mode;
44+
3645
/** Child Plan. */
3746
@Setter private UnresolvedPlan child;
3847

3948
public Rex(UnresolvedExpression field, Literal pattern) {
40-
this(field, pattern, Optional.empty(), Optional.empty());
49+
this(field, pattern, Optional.empty(), Optional.empty(), RexMode.EXTRACT);
4150
}
4251

4352
public Rex(UnresolvedExpression field, Literal pattern, Optional<Integer> maxMatch) {
44-
this(field, pattern, maxMatch, Optional.empty());
53+
this(field, pattern, maxMatch, Optional.empty(), RexMode.EXTRACT);
4554
}
4655

4756
public Rex(
4857
UnresolvedExpression field,
4958
Literal pattern,
5059
Optional<Integer> maxMatch,
5160
Optional<String> offsetField) {
61+
this(field, pattern, maxMatch, offsetField, RexMode.EXTRACT);
62+
}
63+
64+
public Rex(
65+
UnresolvedExpression field,
66+
Literal pattern,
67+
Optional<Integer> maxMatch,
68+
Optional<String> offsetField,
69+
RexMode mode) {
5270
this.field = field;
5371
this.pattern = pattern;
5472
this.maxMatch = maxMatch;
5573
this.offsetField = offsetField;
74+
this.mode = mode;
5675
}
5776

5877
@Override

core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,20 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
180180

181181
RexNode fieldRex = rexVisitor.analyze(node.getField(), context);
182182
String patternStr = (String) node.getPattern().getValue();
183+
184+
if (node.getMode() == Rex.RexMode.SED) {
185+
RexNode sedCall =
186+
PPLFuncImpTable.INSTANCE.resolve(
187+
context.rexBuilder,
188+
BuiltinFunctionName.REX_SED,
189+
fieldRex,
190+
context.rexBuilder.makeLiteral(patternStr));
191+
192+
String fieldName = node.getField().toString();
193+
projectPlusOverriding(List.of(sedCall), List.of(fieldName), context);
194+
return context.relBuilder.peek();
195+
}
196+
183197
List<String> namedGroups = RegexExpression.getNamedGroupCandidates(patternStr);
184198

185199
if (namedGroups.isEmpty()) {

core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ public enum BuiltinFunctionName {
218218
REX_EXTRACT(FunctionName.of("REX_EXTRACT")),
219219
REX_EXTRACT_MULTI(FunctionName.of("REX_EXTRACT_MULTI")),
220220
REX_OFFSET(FunctionName.of("REX_OFFSET")),
221+
REX_SED(FunctionName.of("REX_SED")),
221222
REPLACE(FunctionName.of("replace")),
222223
REVERSE(FunctionName.of("reverse")),
223224
RIGHT(FunctionName.of("right")),

core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
import org.opensearch.sql.expression.function.udf.RexExtractFunction;
5252
import org.opensearch.sql.expression.function.udf.RexExtractMultiFunction;
5353
import org.opensearch.sql.expression.function.udf.RexOffsetFunction;
54+
import org.opensearch.sql.expression.function.udf.RexSedFunction;
5455
import org.opensearch.sql.expression.function.udf.SpanFunction;
5556
import org.opensearch.sql.expression.function.udf.condition.EarliestFunction;
5657
import org.opensearch.sql.expression.function.udf.condition.LatestFunction;
@@ -387,6 +388,7 @@ public class PPLBuiltinOperators extends ReflectiveSqlOperatorTable {
387388
public static final SqlOperator REX_EXTRACT_MULTI =
388389
new RexExtractMultiFunction().toUDF("REX_EXTRACT_MULTI");
389390
public static final SqlOperator REX_OFFSET = new RexOffsetFunction().toUDF("REX_OFFSET");
391+
public static final SqlOperator REX_SED = new RexSedFunction().toUDF("REX_SED");
390392

391393
/**
392394
* Returns the PPL specific operator table, creating it if necessary.

core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@
165165
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_EXTRACT;
166166
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_EXTRACT_MULTI;
167167
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_OFFSET;
168+
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_SED;
168169
import static org.opensearch.sql.expression.function.BuiltinFunctionName.RIGHT;
169170
import static org.opensearch.sql.expression.function.BuiltinFunctionName.RINT;
170171
import static org.opensearch.sql.expression.function.BuiltinFunctionName.ROUND;
@@ -717,6 +718,7 @@ void populate() {
717718
registerOperator(REX_EXTRACT, PPLBuiltinOperators.REX_EXTRACT);
718719
registerOperator(REX_EXTRACT_MULTI, PPLBuiltinOperators.REX_EXTRACT_MULTI);
719720
registerOperator(REX_OFFSET, PPLBuiltinOperators.REX_OFFSET);
721+
registerOperator(REX_SED, PPLBuiltinOperators.REX_SED);
720722

721723
// Register PPL Datetime UDF operator
722724
registerOperator(TIMESTAMP, PPLBuiltinOperators.TIMESTAMP);
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.expression.function.udf;
7+
8+
import java.util.List;
9+
import java.util.regex.Matcher;
10+
import java.util.regex.Pattern;
11+
import org.apache.calcite.adapter.enumerable.NotNullImplementor;
12+
import org.apache.calcite.adapter.enumerable.NullPolicy;
13+
import org.apache.calcite.adapter.enumerable.RexToLixTranslator;
14+
import org.apache.calcite.linq4j.tree.Expression;
15+
import org.apache.calcite.linq4j.tree.Expressions;
16+
import org.apache.calcite.rex.RexCall;
17+
import org.apache.calcite.sql.type.ReturnTypes;
18+
import org.apache.calcite.sql.type.SqlReturnTypeInference;
19+
import org.opensearch.sql.calcite.utils.PPLOperandTypes;
20+
import org.opensearch.sql.expression.function.ImplementorUDF;
21+
import org.opensearch.sql.expression.function.UDFOperandMetadata;
22+
23+
/** Custom REX_SED function for string replacement using sed expressions. */
24+
public final class RexSedFunction extends ImplementorUDF {
25+
26+
public RexSedFunction() {
27+
super(new RexSedImplementor(), NullPolicy.ARG0);
28+
}
29+
30+
@Override
31+
public SqlReturnTypeInference getReturnTypeInference() {
32+
return ReturnTypes.VARCHAR_2000_NULLABLE;
33+
}
34+
35+
@Override
36+
public UDFOperandMetadata getOperandMetadata() {
37+
return PPLOperandTypes.STRING_STRING;
38+
}
39+
40+
private static class RexSedImplementor implements NotNullImplementor {
41+
42+
@Override
43+
public Expression implement(
44+
RexToLixTranslator translator, RexCall call, List<Expression> translatedOperands) {
45+
Expression field = translatedOperands.get(0);
46+
Expression sedExpression = translatedOperands.get(1);
47+
48+
return Expressions.call(RexSedFunction.class, "applySed", field, sedExpression);
49+
}
50+
}
51+
52+
public static String applySed(String text, String sedExpression) {
53+
if (text == null || sedExpression == null) {
54+
return null;
55+
}
56+
57+
try {
58+
if (sedExpression.startsWith("s/")) {
59+
return applySubstitution(text, sedExpression);
60+
} else if (sedExpression.startsWith("y/")) {
61+
return applyTransliteration(text, sedExpression);
62+
} else {
63+
return text;
64+
}
65+
} catch (Exception e) {
66+
return text;
67+
}
68+
}
69+
70+
private static String applySubstitution(String text, String sedExpression) {
71+
Pattern sedPattern = Pattern.compile("s/(.*?)/(.*?)/(.*?)$");
72+
Matcher sedMatcher = sedPattern.matcher(sedExpression);
73+
74+
if (!sedMatcher.matches()) {
75+
return text;
76+
}
77+
78+
String regex = sedMatcher.group(1);
79+
String replacement = sedMatcher.group(2);
80+
String flags = sedMatcher.group(3);
81+
82+
try {
83+
Pattern regexPattern = Pattern.compile(regex);
84+
String javaReplacement = convertSedBackreferencesToJava(replacement);
85+
86+
if (flags.contains("g")) {
87+
return regexPattern.matcher(text).replaceAll(javaReplacement);
88+
} else if (flags.matches("\\d+")) {
89+
int nth = Integer.parseInt(flags);
90+
return replaceNthOccurrence(text, regexPattern, javaReplacement, nth);
91+
} else {
92+
return regexPattern.matcher(text).replaceFirst(javaReplacement);
93+
}
94+
} catch (Exception e) {
95+
return text;
96+
}
97+
}
98+
99+
private static String applyTransliteration(String text, String sedExpression) {
100+
Pattern yPattern = Pattern.compile("y/(.*?)/(.*?)/");
101+
Matcher yMatcher = yPattern.matcher(sedExpression);
102+
103+
if (!yMatcher.matches()) {
104+
return text;
105+
}
106+
107+
String from = yMatcher.group(1);
108+
String to = yMatcher.group(2);
109+
110+
if (from.length() != to.length()) {
111+
return text;
112+
}
113+
114+
StringBuilder result = new StringBuilder();
115+
for (char c : text.toCharArray()) {
116+
int index = from.indexOf(c);
117+
if (index >= 0) {
118+
result.append(to.charAt(index));
119+
} else {
120+
result.append(c);
121+
}
122+
}
123+
124+
return result.toString();
125+
}
126+
127+
private static String replaceNthOccurrence(
128+
String text, Pattern pattern, String replacement, int nth) {
129+
Matcher matcher = pattern.matcher(text);
130+
StringBuffer result = new StringBuffer();
131+
int count = 0;
132+
133+
while (matcher.find()) {
134+
count++;
135+
if (count == nth) {
136+
matcher.appendReplacement(result, replacement);
137+
} else {
138+
matcher.appendReplacement(result, matcher.group());
139+
}
140+
}
141+
matcher.appendTail(result);
142+
143+
return result.toString();
144+
}
145+
146+
private static String convertSedBackreferencesToJava(String sedReplacement) {
147+
return sedReplacement.replaceAll("\\\\(\\d+)", "\\$$1");
148+
}
149+
}

ppl/src/main/antlr/OpenSearchPPLLexer.g4

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ METHOD: 'METHOD';
5252
MAX_SAMPLE_COUNT: 'MAX_SAMPLE_COUNT';
5353
MAX_MATCH: 'MAX_MATCH';
5454
OFFSET_FIELD: 'OFFSET_FIELD';
55+
SED: 'SED';
5556
BUFFER_LIMIT: 'BUFFER_LIMIT';
5657
LABEL: 'LABEL';
5758
AGGREGATION: 'AGGREGATION';

ppl/src/main/antlr/OpenSearchPPLParser.g4

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ rexExpr
201201
rexOption
202202
: MAX_MATCH EQUAL maxMatch=integerLiteral
203203
| OFFSET_FIELD EQUAL offsetField=qualifiedName
204+
| MODE EQUAL SED
204205
;
205206
patternsMethod
206207
: PUNCT

ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -771,22 +771,25 @@ public UnresolvedPlan visitRexCommand(OpenSearchPPLParser.RexCommandContext ctx)
771771
Literal pattern = (Literal) internalVisitExpression(ctx.rexExpr().pattern);
772772
Optional<Integer> maxMatch = Optional.empty();
773773
Optional<String> offsetField = Optional.empty();
774+
Rex.RexMode mode = Rex.RexMode.EXTRACT;
774775

775776
if (ctx.rexExpr().field != null) {
776777
field = internalVisitExpression(ctx.rexExpr().field);
777778
}
778779

779-
// Process rex options
780780
for (OpenSearchPPLParser.RexOptionContext optionCtx : ctx.rexExpr().rexOption()) {
781781
if (optionCtx.maxMatch != null) {
782782
maxMatch = Optional.of(Integer.parseInt(optionCtx.maxMatch.getText()));
783783
}
784784
if (optionCtx.offsetField != null) {
785785
offsetField = Optional.of(internalVisitExpression(optionCtx.offsetField).toString());
786786
}
787+
if (optionCtx.MODE() != null && optionCtx.SED() != null) {
788+
mode = Rex.RexMode.SED;
789+
}
787790
}
788791

789-
return new Rex(field, pattern, maxMatch, offsetField);
792+
return new Rex(field, pattern, maxMatch, offsetField, mode);
790793
}
791794

792795
/** Get original text in query. */

0 commit comments

Comments
 (0)