Skip to content

Commit 8b8da3f

Browse files
committed
[refactor] refactor some regex fn into a util class for re-usage
Signed-off-by: Jialiang Liang <jiallian@amazon.com>
1 parent 4ed8225 commit 8b8da3f

4 files changed

Lines changed: 153 additions & 51 deletions

File tree

core/src/main/java/org/opensearch/sql/expression/function/udf/RegexMatchFunctionImpl.java

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
package org.opensearch.sql.expression.function.udf;
77

88
import java.util.List;
9-
import java.util.regex.Pattern;
109
import java.util.regex.PatternSyntaxException;
1110
import org.apache.calcite.adapter.enumerable.NotNullImplementor;
1211
import org.apache.calcite.adapter.enumerable.NullPolicy;
@@ -67,11 +66,9 @@ public static Boolean eval(String field, String pattern) {
6766
return null;
6867
}
6968

70-
// Match using find() for partial match semantics
69+
// Use shared utility for consistent regex matching
7170
try {
72-
Pattern compiledPattern = Pattern.compile(pattern);
73-
java.util.regex.Matcher matcher = compiledPattern.matcher(field);
74-
return matcher.find(); // Use find() for partial match like SPL
71+
return org.opensearch.sql.expression.parse.RegexCommonUtils.matchesPartial(field, pattern);
7572
} catch (PatternSyntaxException e) {
7673
throw new IllegalArgumentException("Invalid regex pattern: " + e.getMessage());
7774
}

core/src/main/java/org/opensearch/sql/expression/operator/predicate/RegexMatch.java

Lines changed: 9 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,6 @@
55

66
package org.opensearch.sql.expression.operator.predicate;
77

8-
import java.util.concurrent.ConcurrentHashMap;
9-
import java.util.regex.Matcher;
10-
import java.util.regex.Pattern;
118
import java.util.regex.PatternSyntaxException;
129
import lombok.EqualsAndHashCode;
1310
import lombok.Getter;
@@ -19,6 +16,7 @@
1916
import org.opensearch.sql.expression.Expression;
2017
import org.opensearch.sql.expression.ExpressionNodeVisitor;
2118
import org.opensearch.sql.expression.env.Environment;
19+
import org.opensearch.sql.expression.parse.RegexCommonUtils;
2220

2321
/**
2422
* Expression for regex matching using Java's built-in regex engine. Supports standard Java regex
@@ -34,12 +32,6 @@ public class RegexMatch implements Expression {
3432

3533
@Getter private final boolean negated;
3634

37-
// Pattern cache to avoid recompiling the same patterns
38-
private static final ConcurrentHashMap<String, Pattern> patternCache = new ConcurrentHashMap<>();
39-
40-
// Maximum cache size to prevent memory issues
41-
private static final int MAX_CACHE_SIZE = 1000;
42-
4335
public RegexMatch(Expression field, Expression pattern, boolean negated) {
4436
this.field = field;
4537
this.pattern = pattern;
@@ -59,16 +51,17 @@ public ExprValue valueOf(Environment<Expression, ExprValue> valueEnv) {
5951
return ExprValueUtils.booleanValue(false);
6052
}
6153

62-
String text = fieldValue.stringValue();
54+
// Convert field value to string (handles non-string types)
55+
String text = RegexCommonUtils.toStringValue(fieldValue);
6356
String regex = patternValue.stringValue();
6457

65-
try {
66-
// Get compiled pattern from cache or compile new one
67-
Pattern compiledPattern = getCompiledPattern(regex);
58+
if (text == null) {
59+
return ExprValueUtils.booleanValue(false);
60+
}
6861

69-
// Create matcher and check for match
70-
Matcher matcher = compiledPattern.matcher(text);
71-
boolean matches = matcher.find(); // Use find() for partial match like SPL
62+
try {
63+
// Use shared utility for pattern matching
64+
boolean matches = RegexCommonUtils.matchesPartial(text, regex);
7265

7366
// Apply negation if needed
7467
return ExprValueUtils.booleanValue(negated ? !matches : matches);
@@ -78,21 +71,6 @@ public ExprValue valueOf(Environment<Expression, ExprValue> valueEnv) {
7871
}
7972
}
8073

81-
/** Get compiled pattern from cache or compile and cache it. */
82-
private Pattern getCompiledPattern(String regex) {
83-
// Check cache size and clear if needed (simple LRU-like behavior)
84-
if (patternCache.size() > MAX_CACHE_SIZE) {
85-
patternCache.clear();
86-
}
87-
88-
return patternCache.computeIfAbsent(
89-
regex,
90-
r -> {
91-
// Compile with Java regex engine
92-
return Pattern.compile(r);
93-
});
94-
}
95-
9674
@Override
9775
public ExprType type() {
9876
return ExprCoreType.BOOLEAN;
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.expression.parse;
7+
8+
import com.google.common.collect.ImmutableList;
9+
import java.util.List;
10+
import java.util.concurrent.ConcurrentHashMap;
11+
import java.util.regex.Matcher;
12+
import java.util.regex.Pattern;
13+
import java.util.regex.PatternSyntaxException;
14+
import org.opensearch.sql.data.model.ExprValue;
15+
import org.opensearch.sql.data.type.ExprCoreType;
16+
17+
/**
18+
* Common utilities for regex operations. Provides pattern caching and consistent matching behavior.
19+
*/
20+
public class RegexCommonUtils {
21+
22+
// Pattern to identify named capture groups in regex
23+
private static final Pattern NAMED_GROUP_PATTERN =
24+
Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>");
25+
26+
// Pattern cache to avoid recompiling the same patterns
27+
private static final ConcurrentHashMap<String, Pattern> patternCache = new ConcurrentHashMap<>();
28+
29+
// Maximum cache size to prevent memory issues
30+
private static final int MAX_CACHE_SIZE = 1000;
31+
32+
/**
33+
* Get compiled pattern from cache or compile and cache it.
34+
*
35+
* @param regex The regex pattern string
36+
* @return Compiled Pattern object
37+
* @throws PatternSyntaxException if the regex is invalid
38+
*/
39+
public static Pattern getCompiledPattern(String regex) {
40+
// Check cache size and clear if needed (simple LRU-like behavior)
41+
if (patternCache.size() > MAX_CACHE_SIZE) {
42+
patternCache.clear();
43+
}
44+
45+
return patternCache.computeIfAbsent(regex, Pattern::compile);
46+
}
47+
48+
/**
49+
* Extract list of named group candidates from a regex pattern.
50+
*
51+
* @param pattern The regex pattern string
52+
* @return List of named group names found in the pattern
53+
*/
54+
public static List<String> getNamedGroupCandidates(String pattern) {
55+
ImmutableList.Builder<String> namedGroups = ImmutableList.builder();
56+
Matcher m = NAMED_GROUP_PATTERN.matcher(pattern);
57+
while (m.find()) {
58+
namedGroups.add(m.group(1));
59+
}
60+
return namedGroups.build();
61+
}
62+
63+
/**
64+
* Match using find() for partial match semantics with string pattern.
65+
*
66+
* @param text The text to match against
67+
* @param patternStr The pattern string
68+
* @return true if pattern is found anywhere in the text
69+
* @throws PatternSyntaxException if the regex is invalid
70+
*/
71+
public static boolean matchesPartial(String text, String patternStr) {
72+
if (text == null || patternStr == null) {
73+
return false;
74+
}
75+
Pattern pattern = getCompiledPattern(patternStr);
76+
return pattern.matcher(text).find();
77+
}
78+
79+
/**
80+
* Extract a specific named group from text using the pattern. Used by parse command regex method.
81+
*
82+
* @param text The text to extract from
83+
* @param pattern The compiled pattern with named groups
84+
* @param groupName The name of the group to extract
85+
* @return The extracted value or null if not found
86+
*/
87+
public static String extractNamedGroup(String text, Pattern pattern, String groupName) {
88+
if (text == null || pattern == null || groupName == null) {
89+
return null;
90+
}
91+
92+
Matcher matcher = pattern.matcher(text);
93+
94+
// Use matches() for parse command (full match required)
95+
if (matcher.matches()) {
96+
try {
97+
return matcher.group(groupName);
98+
} catch (IllegalArgumentException e) {
99+
// Group name not found
100+
return null;
101+
}
102+
}
103+
104+
return null;
105+
}
106+
107+
/**
108+
* Convert ExprValue to string, handling non-string types. Based on decision to auto-convert
109+
* non-string fields to strings.
110+
*
111+
* @param value The ExprValue to convert
112+
* @return String representation of the value
113+
*/
114+
public static String toStringValue(ExprValue value) {
115+
if (value == null || value.isNull() || value.isMissing()) {
116+
return null;
117+
}
118+
119+
// If already a string, return it directly
120+
if (value.type() == ExprCoreType.STRING) {
121+
return value.stringValue();
122+
}
123+
124+
// Auto-convert non-string types to string
125+
return value.value().toString();
126+
}
127+
}

core/src/main/java/org/opensearch/sql/expression/parse/RegexExpression.java

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,7 @@
55

66
package org.opensearch.sql.expression.parse;
77

8-
import com.google.common.collect.ImmutableList;
98
import java.util.List;
10-
import java.util.regex.Matcher;
119
import java.util.regex.Pattern;
1210
import lombok.EqualsAndHashCode;
1311
import lombok.Getter;
@@ -24,7 +22,6 @@
2422
@ToString
2523
public class RegexExpression extends ParseExpression {
2624
private static final Logger log = LogManager.getLogger(RegexExpression.class);
27-
private static final Pattern GROUP_PATTERN = Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>");
2825
@Getter @EqualsAndHashCode.Exclude private final Pattern regexPattern;
2926

3027
/**
@@ -36,32 +33,35 @@ public class RegexExpression extends ParseExpression {
3633
*/
3734
public RegexExpression(Expression sourceField, Expression pattern, Expression identifier) {
3835
super("regex", sourceField, pattern, identifier);
39-
this.regexPattern = Pattern.compile(pattern.valueOf().stringValue());
36+
this.regexPattern = RegexCommonUtils.getCompiledPattern(pattern.valueOf().stringValue());
4037
}
4138

4239
@Override
4340
ExprValue parseValue(ExprValue value) throws ExpressionEvaluationException {
44-
String rawString = value.stringValue();
45-
Matcher matcher = regexPattern.matcher(rawString);
46-
if (matcher.matches()) {
47-
return new ExprStringValue(matcher.group(identifierStr));
41+
// Convert to string (handles non-string types)
42+
String rawString = RegexCommonUtils.toStringValue(value);
43+
if (rawString == null) {
44+
return new ExprStringValue("");
4845
}
46+
47+
// Extract the specific named group
48+
String extracted = RegexCommonUtils.extractNamedGroup(rawString, regexPattern, identifierStr);
49+
50+
if (extracted != null) {
51+
return new ExprStringValue(extracted);
52+
}
53+
4954
log.debug("failed to extract pattern {} from input ***", regexPattern.pattern());
5055
return new ExprStringValue("");
5156
}
5257

5358
/**
54-
* Get list of derived fields based on parse pattern.
59+
* Get list of derived fields based on parse pattern. Delegates to shared utility.
5560
*
5661
* @param pattern pattern used for parsing
5762
* @return list of names of the derived fields
5863
*/
5964
public static List<String> getNamedGroupCandidates(String pattern) {
60-
ImmutableList.Builder<String> namedGroups = ImmutableList.builder();
61-
Matcher m = GROUP_PATTERN.matcher(pattern);
62-
while (m.find()) {
63-
namedGroups.add(m.group(1));
64-
}
65-
return namedGroups.build();
65+
return RegexCommonUtils.getNamedGroupCandidates(pattern);
6666
}
6767
}

0 commit comments

Comments
 (0)