Skip to content

Commit b8c70ae

Browse files
committed
regex implementation done with pcre4j without calcite
Signed-off-by: Jialiang Liang <jiallian@amazon.com>
1 parent 7a6df3c commit b8c70ae

8 files changed

Lines changed: 177 additions & 138 deletions

File tree

core/src/main/java/org/opensearch/sql/analysis/Analyzer.java

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,10 @@
7777
import org.opensearch.sql.ast.tree.Patterns;
7878
import org.opensearch.sql.ast.tree.Project;
7979
import org.opensearch.sql.ast.tree.RareTopN;
80+
import org.opensearch.sql.ast.tree.Regex;
8081
import org.opensearch.sql.ast.tree.Relation;
8182
import org.opensearch.sql.ast.tree.RelationSubquery;
8283
import org.opensearch.sql.ast.tree.Rename;
83-
import org.opensearch.sql.ast.tree.Regex;
8484
import org.opensearch.sql.ast.tree.Reverse;
8585
import org.opensearch.sql.ast.tree.Sort;
8686
import org.opensearch.sql.ast.tree.Sort.SortOption;
@@ -692,10 +692,24 @@ public LogicalPlan visitReverse(Reverse node, AnalysisContext context) {
692692

693693
@Override
694694
public LogicalPlan visitRegex(Regex node, AnalysisContext context) {
695-
// For now, throw unsupported operation as we're building a PoC
696-
// This will be implemented when we add the execution logic
697-
throw new UnsupportedOperationException(
698-
"REGEX command is not yet fully implemented");
695+
// Get the child plan (source of data)
696+
LogicalPlan child = node.getChild().get(0).accept(this, context);
697+
698+
// Analyze the field and pattern expressions
699+
Expression fieldExpr = expressionAnalyzer.analyze(node.getField(), context);
700+
Expression patternExpr = expressionAnalyzer.analyze(node.getPattern(), context);
701+
702+
// Create the RegexMatch expression directly
703+
// This is our custom PCRE-based implementation
704+
Expression regexExpr =
705+
new org.opensearch.sql.expression.operator.predicate.RegexMatch(
706+
fieldExpr, patternExpr, node.isNegated());
707+
708+
// Return a LogicalFilter with the regex condition
709+
// No need for optimization since RegexMatch is already a concrete expression
710+
LogicalFilter result = new LogicalFilter(child, regexExpr);
711+
712+
return result;
699713
}
700714

701715
@Override

core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,10 @@
6666
import org.opensearch.sql.ast.tree.Patterns;
6767
import org.opensearch.sql.ast.tree.Project;
6868
import org.opensearch.sql.ast.tree.RareTopN;
69+
import org.opensearch.sql.ast.tree.Regex;
6970
import org.opensearch.sql.ast.tree.Relation;
7071
import org.opensearch.sql.ast.tree.RelationSubquery;
7172
import org.opensearch.sql.ast.tree.Rename;
72-
import org.opensearch.sql.ast.tree.Regex;
7373
import org.opensearch.sql.ast.tree.Reverse;
7474
import org.opensearch.sql.ast.tree.Sort;
7575
import org.opensearch.sql.ast.tree.SubqueryAlias;

core/src/main/java/org/opensearch/sql/ast/tree/Regex.java

Lines changed: 34 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -20,39 +20,38 @@
2020
@ToString
2121
@EqualsAndHashCode(callSuper = false)
2222
public class Regex extends UnresolvedPlan {
23-
/** Field to match against. */
24-
private final UnresolvedExpression field;
25-
26-
/** Whether this is a negated match (!=). */
27-
private final boolean negated;
28-
29-
/** Pattern. */
30-
private final Literal pattern;
31-
32-
/** Child Plan. */
33-
@Setter
34-
private UnresolvedPlan child;
35-
36-
public Regex(UnresolvedExpression field, String operator, Literal pattern) {
37-
// Require explicit field - no default to _source for PoC
38-
this.field = field;
39-
this.negated = "!=".equals(operator);
40-
this.pattern = pattern;
41-
}
42-
43-
@Override
44-
public Regex attach(UnresolvedPlan child) {
45-
this.child = child;
46-
return this;
47-
}
48-
49-
@Override
50-
public List<UnresolvedPlan> getChild() {
51-
return this.child == null ? ImmutableList.of() : ImmutableList.of(this.child);
52-
}
53-
54-
@Override
55-
public <T, C> T accept(AbstractNodeVisitor<T, C> nodeVisitor, C context) {
56-
return nodeVisitor.visitRegex(this, context);
57-
}
23+
/** Field to match against. */
24+
private final UnresolvedExpression field;
25+
26+
/** Whether this is a negated match (!=). */
27+
private final boolean negated;
28+
29+
/** Pattern. */
30+
private final Literal pattern;
31+
32+
/** Child Plan. */
33+
@Setter private UnresolvedPlan child;
34+
35+
public Regex(UnresolvedExpression field, String operator, Literal pattern) {
36+
// Require explicit field - no default to _source for PoC
37+
this.field = field;
38+
this.negated = "!=".equals(operator);
39+
this.pattern = pattern;
40+
}
41+
42+
@Override
43+
public Regex attach(UnresolvedPlan child) {
44+
this.child = child;
45+
return this;
46+
}
47+
48+
@Override
49+
public List<UnresolvedPlan> getChild() {
50+
return this.child == null ? ImmutableList.of() : ImmutableList.of(this.child);
51+
}
52+
53+
@Override
54+
public <T, C> T accept(AbstractNodeVisitor<T, C> nodeVisitor, C context) {
55+
return nodeVisitor.visitRegex(this, context);
56+
}
5857
}

core/src/main/java/org/opensearch/sql/expression/ExpressionNodeVisitor.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,4 +101,14 @@ public T visitWhen(WhenClause node, C context) {
101101
public T visitNamedArgument(NamedArgumentExpression node, C context) {
102102
return visitNode(node, context);
103103
}
104+
105+
public T visitRegex(org.opensearch.sql.expression.operator.predicate.RegexMatch node, C context) {
106+
// Visit field and pattern expressions to ensure field extraction works properly
107+
T result = defaultResult();
108+
T fieldResult = node.getField().accept(this, context);
109+
result = aggregateResult(result, fieldResult);
110+
T patternResult = node.getPattern().accept(this, context);
111+
result = aggregateResult(result, patternResult);
112+
return result;
113+
}
104114
}

core/src/main/java/org/opensearch/sql/expression/operator/predicate/RegexMatch.java

Lines changed: 86 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -9,117 +9,107 @@
99
import lombok.EqualsAndHashCode;
1010
import lombok.Getter;
1111
import lombok.ToString;
12-
import org.pcre4j.Pcre4j;
13-
import org.pcre4j.jna.Pcre2;
14-
import org.pcre4j.regex.Pattern;
15-
import org.pcre4j.regex.Matcher;
16-
import org.opensearch.sql.data.model.ExprBooleanValue;
1712
import org.opensearch.sql.data.model.ExprValue;
1813
import org.opensearch.sql.data.model.ExprValueUtils;
1914
import org.opensearch.sql.data.type.ExprCoreType;
2015
import org.opensearch.sql.data.type.ExprType;
2116
import org.opensearch.sql.expression.Expression;
2217
import org.opensearch.sql.expression.ExpressionNodeVisitor;
2318
import org.opensearch.sql.expression.env.Environment;
24-
import org.opensearch.sql.expression.function.FunctionName;
19+
import org.pcre4j.Pcre4j;
20+
import org.pcre4j.jna.Pcre2;
21+
import org.pcre4j.regex.Matcher;
22+
import org.pcre4j.regex.Pattern;
2523

2624
/**
27-
* Expression for PCRE-compatible regex matching using JPCRE2.
28-
* Supports full PCRE features including:
29-
* - Named groups (?<name>...)
30-
* - Lookahead/lookbehind (including variable-length)
31-
* - Backreferences
32-
* - Recursion (?R) and named recursion (?&name)
33-
* - Conditionals (?(condition)yes|no)
25+
* Expression for PCRE-compatible regex matching using PCRE4J. Supports full PCRE features
26+
* including: - Named groups (?<name>...) - Lookahead/lookbehind (including variable-length) -
27+
* Backreferences - Recursion (?R) and named recursion (?&name) - Conditionals (?(condition)yes|no)
3428
* - Inline flags (?i), (?m), (?s), etc.
3529
*/
3630
@ToString
3731
@EqualsAndHashCode
3832
public class RegexMatch implements Expression {
39-
@Getter
40-
private final Expression field;
41-
42-
@Getter
43-
private final Expression pattern;
44-
45-
@Getter
46-
private final boolean negated;
47-
48-
// Pattern cache to avoid recompiling the same patterns
49-
private static final ConcurrentHashMap<String, Pattern> patternCache =
50-
new ConcurrentHashMap<>();
51-
52-
// Maximum cache size to prevent memory issues
53-
private static final int MAX_CACHE_SIZE = 1000;
54-
55-
// Initialize PCRE4J with JNA backend (done once)
56-
static {
57-
Pcre4j.setup(new Pcre2());
33+
@Getter private final Expression field;
34+
35+
@Getter private final Expression pattern;
36+
37+
@Getter private final boolean negated;
38+
39+
// Pattern cache to avoid recompiling the same patterns
40+
private static final ConcurrentHashMap<String, Pattern> patternCache = new ConcurrentHashMap<>();
41+
42+
// Maximum cache size to prevent memory issues
43+
private static final int MAX_CACHE_SIZE = 1000;
44+
45+
// Initialize PCRE4J with JNA backend (done once)
46+
static {
47+
Pcre4j.setup(new Pcre2());
48+
}
49+
50+
public RegexMatch(Expression field, Expression pattern, boolean negated) {
51+
this.field = field;
52+
this.pattern = pattern;
53+
this.negated = negated;
54+
}
55+
56+
@Override
57+
public ExprValue valueOf(Environment<Expression, ExprValue> valueEnv) {
58+
ExprValue fieldValue = field.valueOf(valueEnv);
59+
ExprValue patternValue = pattern.valueOf(valueEnv);
60+
61+
// Handle null/missing values
62+
if (fieldValue.isNull()
63+
|| fieldValue.isMissing()
64+
|| patternValue.isNull()
65+
|| patternValue.isMissing()) {
66+
return ExprValueUtils.booleanValue(false);
5867
}
59-
60-
public RegexMatch(Expression field, Expression pattern, boolean negated) {
61-
this.field = field;
62-
this.pattern = pattern;
63-
this.negated = negated;
68+
69+
String text = fieldValue.stringValue();
70+
String regex = patternValue.stringValue();
71+
72+
try {
73+
// Get compiled pattern from cache or compile new one
74+
Pattern compiledPattern = getCompiledPattern(regex);
75+
76+
// Create matcher and check for match
77+
Matcher matcher = compiledPattern.matcher(text);
78+
boolean matches = matcher.find(); // Use find() for partial match like SPL
79+
80+
// Apply negation if needed
81+
return ExprValueUtils.booleanValue(negated ? !matches : matches);
82+
83+
} catch (Exception e) {
84+
// Return false on pattern compilation/matching errors
85+
// Note: In production, proper logging should be added here
86+
return ExprValueUtils.booleanValue(false);
6487
}
65-
66-
@Override
67-
public ExprValue valueOf(Environment<Expression, ExprValue> valueEnv) {
68-
ExprValue fieldValue = field.valueOf(valueEnv);
69-
ExprValue patternValue = pattern.valueOf(valueEnv);
70-
71-
// Handle null/missing values
72-
if (fieldValue.isNull() || fieldValue.isMissing() ||
73-
patternValue.isNull() || patternValue.isMissing()) {
74-
return ExprValueUtils.booleanValue(false);
75-
}
76-
77-
String text = fieldValue.stringValue();
78-
String regex = patternValue.stringValue();
79-
80-
try {
81-
// Get compiled pattern from cache or compile new one
82-
Pattern compiledPattern = getCompiledPattern(regex);
83-
84-
// Create matcher and check for match
85-
Matcher matcher = compiledPattern.matcher(text);
86-
boolean matches = matcher.find(); // Use find() for partial match like SPL
87-
88-
// Apply negation if needed
89-
return ExprValueUtils.booleanValue(negated ? !matches : matches);
90-
91-
} catch (Exception e) {
92-
// Log error and return false on pattern compilation/matching errors
93-
// In production, you'd want proper logging here
94-
System.err.println("Regex error: " + e.getMessage());
95-
return ExprValueUtils.booleanValue(false);
96-
}
88+
}
89+
90+
/** Get compiled pattern from cache or compile and cache it. */
91+
private Pattern getCompiledPattern(String regex) {
92+
// Bound memory use: once the cache exceeds MAX_CACHE_SIZE, drop every entry (not LRU eviction)
93+
if (patternCache.size() > MAX_CACHE_SIZE) {
94+
patternCache.clear();
9795
}
98-
99-
/**
100-
* Get compiled pattern from cache or compile and cache it.
101-
*/
102-
private Pattern getCompiledPattern(String regex) {
103-
// Check cache size and clear if needed (simple LRU-like behavior)
104-
if (patternCache.size() > MAX_CACHE_SIZE) {
105-
patternCache.clear();
106-
}
107-
108-
return patternCache.computeIfAbsent(regex, r -> {
109-
// Compile with PCRE2 defaults
110-
// pcre4j compiles the pattern with full PCRE2 support
111-
return Pattern.compile(r);
96+
97+
return patternCache.computeIfAbsent(
98+
regex,
99+
r -> {
100+
// Compile with PCRE2 defaults
101+
// pcre4j compiles the pattern with full PCRE2 support
102+
return Pattern.compile(r);
112103
});
113-
}
114-
115-
@Override
116-
public ExprType type() {
117-
return ExprCoreType.BOOLEAN;
118-
}
119-
120-
@Override
121-
public <T, C> T accept(ExpressionNodeVisitor<T, C> visitor, C context) {
122-
// This will be implemented when we add the visitor pattern for expressions
123-
return visitor.visitNode(this, context);
124-
}
125-
}
104+
}
105+
106+
@Override
107+
public ExprType type() {
108+
return ExprCoreType.BOOLEAN;
109+
}
110+
111+
@Override
112+
public <T, C> T accept(ExpressionNodeVisitor<T, C> visitor, C context) {
113+
return visitor.visitRegex(this, context);
114+
}
115+
}

opensearch/src/main/java/org/opensearch/sql/opensearch/storage/scan/OpenSearchIndexScanQueryBuilder.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ public OpenSearchIndexScanQueryBuilder(OpenSearchRequestBuilder requestBuilder)
5757
public boolean pushDownFilter(LogicalFilter filter) {
5858
FilterQueryBuilder queryBuilder = new FilterQueryBuilder(new DefaultExpressionSerializer());
5959
Expression queryCondition = filter.getCondition();
60+
6061
try {
6162
QueryBuilder query = queryBuilder.build(queryCondition);
6263
requestBuilder.pushDownFilter(query);

opensearch/src/main/java/org/opensearch/sql/opensearch/storage/script/filter/FilterQueryBuilder.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.opensearch.sql.expression.ReferenceExpression;
2828
import org.opensearch.sql.expression.function.BuiltinFunctionName;
2929
import org.opensearch.sql.expression.function.FunctionName;
30+
import org.opensearch.sql.expression.operator.predicate.RegexMatch;
3031
import org.opensearch.sql.opensearch.storage.script.CompoundedScriptEngine.ScriptEngineType;
3132
import org.opensearch.sql.opensearch.storage.script.core.ExpressionScript;
3233
import org.opensearch.sql.opensearch.storage.script.filter.lucene.LikeQuery;
@@ -153,4 +154,28 @@ private ScriptQueryBuilder buildScriptQuery(FunctionExpression node) {
153154
SerializationWrapper.wrapWithLangType(ScriptEngineType.V2, serializer.serialize(node)),
154155
emptyMap()));
155156
}
157+
158+
/**
159+
* Visit RegexMatch expression and convert to script query. This allows PCRE regex evaluation to
160+
* be pushed down to OpenSearch data nodes.
161+
*/
162+
public QueryBuilder visitRegex(RegexMatch regexMatch, Object context) {
163+
return buildScriptQueryForRegex(regexMatch);
164+
}
165+
166+
private ScriptQueryBuilder buildScriptQueryForRegex(RegexMatch regexMatch) {
167+
Set<ReferenceExpression> fields = ExpressionScript.extractFields(regexMatch);
168+
if (fields.stream().anyMatch(field -> field.getType() == ExprCoreType.STRUCT)) {
169+
throw new ScriptQueryUnSupportedException(
170+
"Script query does not support fields of struct type in OpenSearch.");
171+
}
172+
173+
return new ScriptQueryBuilder(
174+
new Script(
175+
DEFAULT_SCRIPT_TYPE,
176+
COMPOUNDED_LANG_NAME,
177+
SerializationWrapper.wrapWithLangType(
178+
ScriptEngineType.V2, serializer.serialize(regexMatch)),
179+
emptyMap()));
180+
}
156181
}

0 commit comments

Comments
 (0)