Skip to content

Commit e6ab4fb

Browse files
authored
Calcite patterns command brain pattern method (opensearch-project#3570)
* Revert simple_pattern window function change to recover pushdown ability Signed-off-by: Songkan Tang <songkant@amazon.com> * Add SIMPLE_PATTERN patterns command support based on parse command Signed-off-by: Songkan Tang <songkant@amazon.com> * Address minor comments Signed-off-by: Songkan Tang <songkant@amazon.com> * Address comments part 2 Signed-off-by: Songkan Tang <songkant@amazon.com> * Make allowCast for pattern VARCHAR literal Signed-off-by: Songkan Tang <songkant@amazon.com> * Fix spotless Signed-off-by: Songkan Tang <songkant@amazon.com> * Minor ut failure fix Signed-off-by: Songkan Tang <songkant@amazon.com> * Brain patterns command in Calcite with combined UDF and UDAF Signed-off-by: Songkan Tang <songkant@amazon.com> * Revert debug flag Signed-off-by: Songkan Tang <songkant@amazon.com> * Minor ut failure fix Signed-off-by: Songkan Tang <songkant@amazon.com> * Minor ut failure fix part2 Signed-off-by: Songkan Tang <songkant@amazon.com> * Pick missing ast Window from main Signed-off-by: Songkan Tang <songkant@amazon.com> * Support agg and label mode and new model for patterns command Signed-off-by: Songkan Tang <songkant@amazon.com> * Remove unnecessary files and comments Signed-off-by: Songkan Tang <songkant@amazon.com> * Use uncollect_patterns table function to flatten patterns list Signed-off-by: Songkan Tang <songkant@amazon.com> * Fix partial UT Signed-off-by: Songkan Tang <songkant@amazon.com> * Add 3570 yaml tests Signed-off-by: Songkan Tang <songkant@amazon.com> * Fix plans in explain ITs Signed-off-by: Songkan Tang <songkant@amazon.com> * Fix pushdown ITs failure Signed-off-by: Songkan Tang <songkant@amazon.com> * Fix doctest examples for V2 engine results Signed-off-by: Songkan Tang <songkant@amazon.com> * Minor fix after rebasing Signed-off-by: Songkan Tang <songkant@amazon.com> * Uncomment build.gradle change Signed-off-by: Songkan Tang <songkant@amazon.com> * Address minor comment Signed-off-by: Songkan Tang <songkant@amazon.com> * Address patterns doc comments and fix conflicts Signed-off-by: Songkan Tang <songkant@amazon.com> * Fix doctest Signed-off-by: Songkan Tang <songkant@amazon.com> * Reuse expand command plan to replace hacky uncollect_patterns UDTF Signed-off-by: Songkan Tang <songkant@amazon.com> * Minor fix after resolving merge conflicts Signed-off-by: Songkan Tang <songkant@amazon.com> * Refactor duplicate building expand rel node logic Signed-off-by: Songkan Tang <songkant@amazon.com> * Fix the issue of expand command plan executing main query twice Signed-off-by: Songkan Tang <songkant@amazon.com> --------- Signed-off-by: Songkan Tang <songkant@amazon.com>
1 parent 066e429 commit e6ab4fb

49 files changed

Lines changed: 2321 additions & 476 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

benchmarks/src/jmh/java/org/opensearch/sql/expression/operator/predicate/PatternsWindowFunctionBenchmark.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,10 @@
2929
import org.opensearch.sql.expression.Expression;
3030
import org.opensearch.sql.expression.NamedArgumentExpression;
3131
import org.opensearch.sql.expression.ReferenceExpression;
32+
import org.opensearch.sql.expression.parse.PatternsExpression;
3233
import org.opensearch.sql.expression.window.WindowDefinition;
3334
import org.opensearch.sql.expression.window.frame.BufferPatternRowsWindowFrame;
35+
import org.opensearch.sql.expression.window.frame.CurrentRowWindowFrame;
3436
import org.opensearch.sql.expression.window.frame.WindowFrame;
3537

3638
@Warmup(iterations = 1)
@@ -61,6 +63,16 @@ public class PatternsWindowFunctionBenchmark {
6163
new BrainLogParser(),
6264
new NamedArgumentExpression("message", new ReferenceExpression("message", STRING)));
6365

66+
@Benchmark
67+
public void testSimplePattern() {
68+
CurrentRowWindowFrame windowFrame =
69+
new CurrentRowWindowFrame(new WindowDefinition(ImmutableList.of(), ImmutableList.of()));
70+
71+
run(
72+
windowFrame,
73+
new PatternsExpression(DSL.ref("message", STRING), DSL.literal(""), DSL.literal("")));
74+
}
75+
6476
@Benchmark
6577
public void testBrain() {
6678
BufferPatternRowsWindowFrame windowFrame =

common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java

Lines changed: 89 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,13 @@
2020
import java.util.regex.Pattern;
2121
import java.util.stream.Collectors;
2222
import java.util.stream.IntStream;
23+
import lombok.Getter;
2324

2425
/** Log parser Brain algorithm implementation. See: https://ieeexplore.ieee.org/document/10109145 */
2526
public class BrainLogParser {
2627

2728
private static final String VARIABLE_DENOTER = "<*>";
28-
private static final Map<Pattern, String> DEFAULT_FILTER_PATTERN_VARIABLE_MAP =
29+
public static final Map<Pattern, String> DEFAULT_FILTER_PATTERN_VARIABLE_MAP =
2930
new LinkedHashMap<>();
3031

3132
static {
@@ -48,26 +49,44 @@ public class BrainLogParser {
4849
Pattern.compile("(?<=[^A-Za-z0-9 ])(-?\\+?\\d+)(?=[^A-Za-z0-9])"), VARIABLE_DENOTER);
4950
}
5051

51-
private static final List<String> DEFAULT_DELIMITERS = List.of(",", "+");
52+
public static final List<String> DEFAULT_DELIMITERS = List.of(",", "+");
5253
// counting frequency will be grouped by composite of position and token string
5354
private static final String POSITIONED_TOKEN_KEY_FORMAT = "%d-%s";
5455
// Token set will be grouped by composite of tokens length per log message, word combination
5556
// candidate and token position.
5657
private static final String GROUP_TOKEN_SET_KEY_FORMAT = "%d-%s-%d";
5758
// By default, algorithm treats more than 2 different tokens in the group per position as variable
5859
// token
59-
private static final int DEFAULT_VARIABLE_COUNT_THRESHOLD = 5;
60+
public static final int DEFAULT_VARIABLE_COUNT_THRESHOLD = 5;
6061
/*
6162
* By default, algorithm treats the longest word combinations as the group root, no matter what its frequency is.
6263
* Otherwise, the longest word combination will be selected when frequency >= highest frequency of log * threshold percentage
6364
*/
64-
private static final float DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE = 0.3f;
65+
public static final float DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE = 0.3f;
6566

66-
private final Map<String, Long> tokenFreqMap;
67-
private final Map<String, Set<String>> groupTokenSetMap;
68-
private final Map<String, String> logIdGroupCandidateMap;
69-
private final int variableCountThreshold;
70-
private final float thresholdPercentage;
67+
/**
68+
* -- GETTER -- Get token histogram
69+
*
70+
* @return map of token per position key and its frequency
71+
*/
72+
@Getter private final Map<String, Long> tokenFreqMap;
73+
74+
/**
75+
* -- GETTER -- Get group per length per position to its token set map
76+
*
77+
* @return map of pattern group per length per position key and its token set
78+
*/
79+
@Getter private final Map<String, Set<String>> groupTokenSetMap;
80+
81+
/**
82+
* -- GETTER -- Get logId to its group candidate map
83+
*
84+
* @return map of logId and group candidate
85+
*/
86+
@Getter private final Map<String, String> logIdGroupCandidateMap;
87+
88+
@Getter private int variableCountThreshold;
89+
@Getter private float thresholdPercentage;
7190
private final Map<Pattern, String> filterPatternVariableMap;
7291
private final List<String> delimiters;
7392

@@ -151,6 +170,15 @@ public List<String> preprocess(String logMessage, String logId) {
151170
if (logMessage == null || logId == null) {
152171
throw new IllegalArgumentException("log message or logId must not be null");
153172
}
173+
174+
List<String> tokens = preprocess(logMessage, this.filterPatternVariableMap, this.delimiters);
175+
tokens.add(logId);
176+
177+
return tokens;
178+
}
179+
180+
public static List<String> preprocess(
181+
String logMessage, Map<Pattern, String> filterPatternVariableMap, List<String> delimiters) {
154182
// match regex and replace it with variable denoter in order
155183
for (Map.Entry<Pattern, String> patternVariablePair : filterPatternVariableMap.entrySet()) {
156184
logMessage =
@@ -165,9 +193,9 @@ public List<String> preprocess(String logMessage, String logId) {
165193
}
166194

167195
// Append logId/docId to the end of the split tokens
168-
logMessage = logMessage.trim() + " " + logId;
196+
logMessage = logMessage.trim();
169197

170-
return Arrays.asList(logMessage.split("\\s+"));
198+
return new ArrayList<>(Arrays.asList(logMessage.split("\\s+")));
171199
}
172200

173201
/**
@@ -240,8 +268,22 @@ void calculateGroupTokenFreq(List<List<String>> preprocessedLogs) {
240268
* @return parsed log pattern that is a list of string
241269
*/
242270
public List<String> parseLogPattern(List<String> tokens) {
271+
return parseLogPattern(
272+
tokens,
273+
this.tokenFreqMap,
274+
this.groupTokenSetMap,
275+
this.logIdGroupCandidateMap,
276+
this.variableCountThreshold);
277+
}
278+
279+
public static List<String> parseLogPattern(
280+
List<String> tokens,
281+
Map<String, Long> tokenFreqMap,
282+
Map<String, Set<String>> groupTokenSetMap,
283+
Map<String, String> logIdGroupCandidateMap,
284+
int variableCountThreshold) {
243285
String logId = tokens.get(tokens.size() - 1);
244-
String groupCandidateStr = this.logIdGroupCandidateMap.get(logId);
286+
String groupCandidateStr = logIdGroupCandidateMap.get(logId);
245287
String[] groupCandidate = groupCandidateStr.split(",");
246288
Long repFreq = Long.parseLong(groupCandidate[0]); // representative frequency of the group
247289
return IntStream.range(0, tokens.size() - 1)
@@ -252,26 +294,26 @@ public List<String> parseLogPattern(List<String> tokens) {
252294
String token = entry.getValue();
253295
String tokenKey =
254296
String.format(Locale.ROOT, POSITIONED_TOKEN_KEY_FORMAT, index, token);
255-
assert this.tokenFreqMap.get(tokenKey) != null
297+
assert tokenFreqMap.get(tokenKey) != null
256298
: String.format(Locale.ROOT, "Not found token: %s on position %d", token, index);
257299

258-
boolean isHigherFrequency = this.tokenFreqMap.get(tokenKey) > repFreq;
259-
boolean isLowerFrequency = this.tokenFreqMap.get(tokenKey) < repFreq;
300+
boolean isHigherFrequency = tokenFreqMap.get(tokenKey) > repFreq;
301+
boolean isLowerFrequency = tokenFreqMap.get(tokenKey) < repFreq;
260302
String groupTokenKey =
261303
String.format(
262304
Locale.ROOT,
263305
GROUP_TOKEN_SET_KEY_FORMAT,
264306
tokens.size() - 1,
265307
groupCandidateStr,
266308
index);
267-
assert this.groupTokenSetMap.get(groupTokenKey) != null
309+
assert groupTokenSetMap.get(groupTokenKey) != null
268310
: String.format(Locale.ROOT, "Not found any token in group: %s", groupTokenKey);
269311

270312
if (isHigherFrequency) {
271313
// For higher frequency token that doesn't belong to word combination, it's likely
272314
// to be constant token only if
273315
// it's unique token on that position within the group
274-
boolean isUniqueToken = this.groupTokenSetMap.get(groupTokenKey).size() == 1;
316+
boolean isUniqueToken = groupTokenSetMap.get(groupTokenKey).size() == 1;
275317
if (!isUniqueToken) {
276318
return VARIABLE_DENOTER;
277319
}
@@ -281,7 +323,7 @@ public List<String> parseLogPattern(List<String> tokens) {
281323
// it doesn't exceed the preset variable count threshold. For example, some variable
282324
// are limited number of enums,
283325
// and sometimes they could be treated as constant tokens.
284-
if (this.groupTokenSetMap.get(groupTokenKey).size() >= variableCountThreshold) {
326+
if (groupTokenSetMap.get(groupTokenKey).size() >= variableCountThreshold) {
285327
return VARIABLE_DENOTER;
286328
}
287329
}
@@ -296,46 +338,42 @@ public List<String> parseLogPattern(List<String> tokens) {
296338
* @param logMessages all lines of log messages
297339
* @return log pattern map with log pattern string as key, grouped logIds as value
298340
*/
299-
public Map<String, List<String>> parseAllLogPatterns(List<String> logMessages) {
341+
public Map<String, Map<String, Object>> parseAllLogPatterns(
342+
List<String> logMessages, int maxSampleCount) {
300343
List<List<String>> processedMessages = this.preprocessAllLogs(logMessages);
301344

302-
Map<String, List<String>> logPatternMap = new HashMap<>();
303-
for (List<String> processedMessage : processedMessages) {
304-
String logId = processedMessage.get(processedMessage.size() - 1);
305-
List<String> logPattern = this.parseLogPattern(processedMessage);
345+
Map<String, Map<String, Object>> logPatternMap = new HashMap<>();
346+
for (int i = 0; i < processedMessages.size(); i++) {
347+
List<String> logPattern = this.parseLogPattern(processedMessages.get(i));
306348
String patternKey = String.join(" ", logPattern);
307-
logPatternMap.computeIfAbsent(patternKey, k -> new ArrayList<>()).add(logId);
349+
String sampleLog = logMessages.get(i);
350+
logPatternMap.compute(
351+
patternKey,
352+
(key, stats) -> {
353+
if (stats == null) {
354+
Map<String, Object> newStats = new HashMap<>();
355+
newStats.put(PatternUtils.PATTERN, key);
356+
newStats.put(PatternUtils.PATTERN_COUNT, 1L);
357+
List<String> samples = new ArrayList<>();
358+
if (sampleLog != null && samples.size() < maxSampleCount) {
359+
samples.add(sampleLog);
360+
}
361+
newStats.put(PatternUtils.SAMPLE_LOGS, samples);
362+
return newStats;
363+
} else {
364+
stats.put(
365+
PatternUtils.PATTERN_COUNT, ((Long) stats.get(PatternUtils.PATTERN_COUNT)) + 1);
366+
List<String> samples = (List<String>) stats.get(PatternUtils.SAMPLE_LOGS);
367+
if (sampleLog != null && samples.size() < maxSampleCount) {
368+
samples.add(sampleLog);
369+
}
370+
return stats;
371+
}
372+
});
308373
}
309374
return logPatternMap;
310375
}
311376

312-
/**
313-
* Get token histogram
314-
*
315-
* @return map of token per position key and its frequency
316-
*/
317-
public Map<String, Long> getTokenFreqMap() {
318-
return this.tokenFreqMap;
319-
}
320-
321-
/**
322-
* Get group per length per position to its token set map
323-
*
324-
* @return map of pattern group per length per position key and its token set
325-
*/
326-
public Map<String, Set<String>> getGroupTokenSetMap() {
327-
return this.groupTokenSetMap;
328-
}
329-
330-
/**
331-
* Get logId to its group candidate map
332-
*
333-
* @return map of logId and group candidate
334-
*/
335-
public Map<String, String> getLogIdGroupCandidateMap() {
336-
return this.logIdGroupCandidateMap;
337-
}
338-
339377
private Map<Long, Integer> getWordOccurrences(List<String> tokens) {
340378
Map<Long, Integer> occurrences = new HashMap<>();
341379
for (int i = 0; i < tokens.size() - 1; i++) {

0 commit comments

Comments
 (0)