Skip to content

Commit c87f897

Browse files
feat: Generate all possible wildcard subqueries for non-capture schema-based search, preventing unnecessary archive decompression. (#1313)
Co-authored-by: SharafMohamed <SharafMohamed@users.noreply.github.com> Co-authored-by: davidlion <david.lion@yscope.com>
1 parent 309f125 commit c87f897

16 files changed

Lines changed: 1740 additions & 293 deletions

components/core/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -575,6 +575,8 @@ set(SOURCE_FILES_unitTest
575575
src/clp/GrepCore.hpp
576576
src/clp/hash_utils.cpp
577577
src/clp/hash_utils.hpp
578+
src/clp/SchemaSearcher.cpp
579+
src/clp/SchemaSearcher.hpp
578580
src/clp/ir/constants.hpp
579581
src/clp/ir/EncodedTextAst.cpp
580582
src/clp/ir/EncodedTextAst.hpp
@@ -704,6 +706,11 @@ set(SOURCE_FILES_unitTest
704706
tests/clp_s_test_utils.cpp
705707
tests/clp_s_test_utils.hpp
706708
tests/LogSuppressor.hpp
709+
tests/MockLogTypeDictionary.hpp
710+
tests/MockVariableDictionary.hpp
711+
tests/SchemaSearcherTest.hpp
712+
tests/search_test_utils.cpp
713+
tests/search_test_utils.hpp
707714
tests/TestOutputCleaner.hpp
708715
tests/test-BoundedReader.cpp
709716
tests/test-BufferedReader.cpp
@@ -731,6 +738,7 @@ set(SOURCE_FILES_unitTest
731738
tests/test-ParserWithUserSchema.cpp
732739
tests/test-query_methods.cpp
733740
tests/test-regex_utils.cpp
741+
tests/test-SchemaSearcher.cpp
734742
tests/test-Segment.cpp
735743
tests/test-SQLiteDB.cpp
736744
tests/test-Stopwatch.cpp
@@ -790,4 +798,7 @@ if(CLP_ENABLE_TESTS)
790798
target_compile_features(unitTest
791799
PRIVATE cxx_std_20
792800
)
801+
target_compile_definitions(unitTest
802+
PRIVATE CLP_ENABLE_TESTS
803+
)
793804
endif()
Lines changed: 1 addition & 138 deletions
Original file line number | Diff line number | Diff line change
@@ -1,35 +1,19 @@
11
#include "GrepCore.hpp"
22

33
#include <cstddef>
4-
#include <set>
54
#include <string>
65
#include <string_view>
76

8-
#include <log_surgeon/Constants.hpp>
97
#include <string_utils/string_utils.hpp>
108

11-
#include "ir/parsing.hpp"
12-
#include "LogSurgeonReader.hpp"
13-
#include "QueryToken.hpp"
14-
#include "StringReader.hpp"
9+
#include <clp/ir/parsing.hpp>
1510

1611
using clp::ir::is_delim;
1712
using clp::string_utils::is_alphabet;
1813
using clp::string_utils::is_wildcard;
1914
using std::string;
2015

2116
namespace clp {
22-
namespace {
23-
/**
24-
* Wraps the tokens returned from the log_surgeon lexer, and stores the variable ids of the tokens
25-
* in a search query in a set. This allows for optimized search performance.
26-
*/
27-
class SearchToken : public log_surgeon::Token {
28-
public:
29-
std::set<int> m_type_ids_set;
30-
};
31-
} // namespace
32-
3317
bool GrepCore::get_bounds_of_next_potential_var(
3418
string const& value,
3519
size_t& begin_pos,
@@ -149,125 +133,4 @@ bool GrepCore::get_bounds_of_next_potential_var(
149133

150134
return (value_length != begin_pos);
151135
}
152-
153-
bool GrepCore::get_bounds_of_next_potential_var(
154-
string const& value,
155-
size_t& begin_pos,
156-
size_t& end_pos,
157-
bool& is_var,
158-
log_surgeon::lexers::ByteLexer& lexer
159-
) {
160-
size_t const value_length = value.length();
161-
if (end_pos >= value_length) {
162-
return false;
163-
}
164-
165-
is_var = false;
166-
bool contains_wildcard = false;
167-
while (false == is_var && false == contains_wildcard && begin_pos < value_length) {
168-
// Start search at end of last token
169-
begin_pos = end_pos;
170-
171-
// Find variable begin or wildcard
172-
bool is_escaped = false;
173-
for (; begin_pos < value_length; ++begin_pos) {
174-
char c = value[begin_pos];
175-
176-
if (is_escaped) {
177-
is_escaped = false;
178-
179-
if (false == lexer.is_delimiter(c)) {
180-
// Found escaped non-delimiter, so reverse the index to retain the escape
181-
// character
182-
--begin_pos;
183-
break;
184-
}
185-
} else if ('\\' == c) {
186-
// Escape character
187-
is_escaped = true;
188-
} else {
189-
if (is_wildcard(c)) {
190-
contains_wildcard = true;
191-
break;
192-
}
193-
if (false == lexer.is_delimiter(c)) {
194-
break;
195-
}
196-
}
197-
}
198-
199-
// Find next delimiter
200-
is_escaped = false;
201-
end_pos = begin_pos;
202-
for (; end_pos < value_length; ++end_pos) {
203-
char c = value[end_pos];
204-
205-
if (is_escaped) {
206-
is_escaped = false;
207-
208-
if (lexer.is_delimiter(c)) {
209-
// Found escaped delimiter, so reverse the index to retain the escape character
210-
--end_pos;
211-
break;
212-
}
213-
} else if ('\\' == c) {
214-
// Escape character
215-
is_escaped = true;
216-
} else {
217-
if (is_wildcard(c)) {
218-
contains_wildcard = true;
219-
} else if (lexer.is_delimiter(c)) {
220-
// Found delimiter that's not also a wildcard
221-
break;
222-
}
223-
}
224-
}
225-
226-
if (end_pos > begin_pos) {
227-
bool has_prefix_wildcard = ('*' == value[begin_pos]) || ('?' == value[begin_pos]);
228-
bool has_suffix_wildcard = ('*' == value[end_pos - 1]) || ('?' == value[end_pos - 1]);
229-
bool has_wildcard_in_middle = false;
230-
for (size_t i = begin_pos + 1; i < end_pos - 1; ++i) {
231-
if (('*' == value[i] || '?' == value[i]) && value[i - 1] != '\\') {
232-
has_wildcard_in_middle = true;
233-
break;
234-
}
235-
}
236-
SearchToken search_token;
237-
if (has_wildcard_in_middle || has_prefix_wildcard) {
238-
// DO NOTHING
239-
} else {
240-
StringReader string_reader;
241-
LogSurgeonReader reader_wrapper(string_reader);
242-
log_surgeon::ParserInputBuffer parser_input_buffer;
243-
if (has_suffix_wildcard) { // text*
244-
// TODO: creating a string reader, setting it equal to a string, to read it into
245-
// the ParserInputBuffer, seems like a convoluted way to set a string equal to a
246-
// string, should be improved when adding a SearchParser to log_surgeon
247-
string_reader.open(value.substr(begin_pos, end_pos - begin_pos - 1));
248-
parser_input_buffer.read_if_safe(reader_wrapper);
249-
lexer.reset();
250-
lexer.scan_with_wildcard(parser_input_buffer, value[end_pos - 1], search_token);
251-
} else { // no wildcards
252-
string_reader.open(value.substr(begin_pos, end_pos - begin_pos));
253-
parser_input_buffer.read_if_safe(reader_wrapper);
254-
lexer.reset();
255-
auto [err, token] = lexer.scan(parser_input_buffer);
256-
if (log_surgeon::ErrorCode::Success != err) {
257-
return false;
258-
}
259-
search_token = SearchToken{token.value()};
260-
search_token.m_type_ids_set.insert(search_token.get_type_ids()->at(0));
261-
}
262-
auto const& type = search_token.get_type_ids()->at(0);
263-
if (type != static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString)
264-
&& type != static_cast<int>(log_surgeon::SymbolId::TokenEnd))
265-
{
266-
is_var = true;
267-
}
268-
}
269-
}
270-
}
271-
return (value_length != begin_pos);
272-
}
273136
} // namespace clp

components/core/src/clp/GrepCore.hpp

Lines changed: 67 additions & 94 deletions
Original file line number | Diff line number | Diff line change
@@ -13,14 +13,15 @@
1313
#include <string_utils/constants.hpp>
1414
#include <string_utils/string_utils.hpp>
1515

16-
#include "Defs.h"
17-
#include "EncodedVariableInterpreter.hpp"
18-
#include "ir/parsing.hpp"
19-
#include "ir/types.hpp"
20-
#include "LogTypeDictionaryReaderReq.hpp"
21-
#include "Query.hpp"
22-
#include "QueryToken.hpp"
23-
#include "VariableDictionaryReaderReq.hpp"
16+
#include <clp/Defs.h>
17+
#include <clp/EncodedVariableInterpreter.hpp>
18+
#include <clp/ir/parsing.hpp>
19+
#include <clp/ir/types.hpp>
20+
#include <clp/LogTypeDictionaryReaderReq.hpp>
21+
#include <clp/Query.hpp>
22+
#include <clp/QueryToken.hpp>
23+
#include <clp/SchemaSearcher.hpp>
24+
#include <clp/VariableDictionaryReaderReq.hpp>
2425

2526
namespace clp {
2627
class GrepCore {
@@ -75,24 +76,6 @@ class GrepCore {
7576
bool& is_var
7677
);
7778

78-
/**
79-
* Returns bounds of next potential variable (either a definite variable or a token with
80-
* wildcards)
81-
* @param value String containing token
82-
* @param begin_pos Begin position of last token, changes to begin position of next token
83-
* @param end_pos End position of last token, changes to end position of next token
84-
* @param is_var Whether the token is definitely a variable
85-
* @param lexer DFA for determining if input is in the schema
86-
* @return true if another potential variable was found, false otherwise
87-
*/
88-
static bool get_bounds_of_next_potential_var(
89-
std::string const& value,
90-
size_t& begin_pos,
91-
size_t& end_pos,
92-
bool& is_var,
93-
log_surgeon::lexers::ByteLexer& lexer
94-
);
95-
9679
private:
9780
// Types
9881
enum class SubQueryMatchabilityResult : uint8_t {
@@ -163,13 +146,18 @@ std::optional<Query> GrepCore::process_raw_query(
163146
log_surgeon::lexers::ByteLexer& lexer,
164147
bool use_heuristic
165148
) {
166-
// Split search_string into tokens with wildcards
167-
std::vector<QueryToken> query_tokens;
168-
size_t begin_pos = 0;
169-
size_t end_pos = 0;
170-
bool is_var;
171-
std::string search_string_for_sub_queries{search_string};
172-
if (use_heuristic) {
149+
std::vector<SubQuery> sub_queries;
150+
if (false == use_heuristic) {
151+
sub_queries
152+
= SchemaSearcher::search(search_string, lexer, logtype_dict, var_dict, ignore_case);
153+
} else {
154+
// Split search_string into tokens with wildcards
155+
std::vector<QueryToken> query_tokens;
156+
size_t begin_pos{0};
157+
size_t end_pos{0};
158+
bool is_var{false};
159+
std::string search_string_for_sub_queries{search_string};
160+
173161
// Replace unescaped '?' wildcards with '*' wildcards since we currently have no support for
174162
// generating sub-queries with '?' wildcards. The final wildcard match on the decompressed
175163
// message uses the original wildcards, so correctness will be maintained.
@@ -192,70 +180,55 @@ std::optional<Query> GrepCore::process_raw_query(
192180
{
193181
query_tokens.emplace_back(search_string_for_sub_queries, begin_pos, end_pos, is_var);
194182
}
195-
} else {
196-
while (get_bounds_of_next_potential_var(
197-
search_string_for_sub_queries,
198-
begin_pos,
199-
end_pos,
200-
is_var,
201-
lexer
202-
))
203-
{
204-
query_tokens.emplace_back(search_string_for_sub_queries, begin_pos, end_pos, is_var);
205-
}
206-
}
207-
208-
// Get pointers to all ambiguous tokens. Exclude tokens with wildcards in the middle since we
209-
// fall back to decompression + wildcard matching for those.
210-
std::vector<QueryToken*> ambiguous_tokens;
211-
for (auto& query_token : query_tokens) {
212-
if (!query_token.has_greedy_wildcard_in_middle() && query_token.is_ambiguous_token()) {
213-
ambiguous_tokens.push_back(&query_token);
183+
// Get pointers to all ambiguous tokens. Exclude tokens with wildcards in the middle since
184+
// we fall back to decompression + wildcard matching for those.
185+
std::vector<QueryToken*> ambiguous_tokens;
186+
for (auto& query_token : query_tokens) {
187+
if (false == query_token.has_greedy_wildcard_in_middle()
188+
&& query_token.is_ambiguous_token())
189+
{
190+
ambiguous_tokens.push_back(&query_token);
191+
}
214192
}
215-
}
216193

217-
// Generate a sub-query for each combination of ambiguous tokens
218-
// E.g., if there are two ambiguous tokens each of which could be a logtype or variable, we need
219-
// to create:
220-
// - (token1 as logtype) (token2 as logtype)
221-
// - (token1 as logtype) (token2 as var)
222-
// - (token1 as var) (token2 as logtype)
223-
// - (token1 as var) (token2 as var)
224-
std::vector<SubQuery> sub_queries;
225-
std::string logtype;
226-
bool type_of_one_token_changed = true;
227-
while (type_of_one_token_changed) {
228-
SubQuery sub_query;
229-
230-
// Compute logtypes and variables for query
231-
auto matchability = generate_logtypes_and_vars_for_subquery(
232-
logtype_dict,
233-
var_dict,
234-
search_string_for_sub_queries,
235-
query_tokens,
236-
ignore_case,
237-
sub_query
238-
);
239-
switch (matchability) {
240-
case SubQueryMatchabilityResult::SupercedesAllSubQueries:
241-
// Since other sub-queries will be superseded by this one, we can stop processing
242-
// now
243-
return Query{search_begin_ts, search_end_ts, ignore_case, search_string, {}};
244-
case SubQueryMatchabilityResult::MayMatch:
245-
sub_queries.push_back(std::move(sub_query));
246-
break;
247-
case SubQueryMatchabilityResult::WontMatch:
248-
default:
249-
// Do nothing
250-
break;
251-
}
194+
// Generate a sub-query for each combination of ambiguous tokens
195+
// E.g., if there are two ambiguous tokens each of which could be a logtype or variable, we
196+
// need to create:
197+
// - (token1 as logtype) (token2 as logtype)
198+
// - (token1 as logtype) (token2 as var)
199+
// - (token1 as var) (token2 as logtype)
200+
// - (token1 as var) (token2 as var)
201+
bool type_of_one_token_changed{true};
202+
while (type_of_one_token_changed) {
203+
SubQuery sub_query;
204+
auto matchability{generate_logtypes_and_vars_for_subquery(
205+
logtype_dict,
206+
var_dict,
207+
search_string_for_sub_queries,
208+
query_tokens,
209+
ignore_case,
210+
sub_query
211+
)};
212+
switch (matchability) {
213+
case SubQueryMatchabilityResult::SupercedesAllSubQueries:
214+
// Since other sub-queries will be superseded by this one, we can stop
215+
// processing now.
216+
return Query{search_begin_ts, search_end_ts, ignore_case, search_string, {}};
217+
case SubQueryMatchabilityResult::MayMatch:
218+
sub_queries.push_back(std::move(sub_query));
219+
break;
220+
case SubQueryMatchabilityResult::WontMatch:
221+
default:
222+
break;
223+
}
252224

253-
// Update combination of ambiguous tokens
254-
type_of_one_token_changed = false;
255-
for (auto* ambiguous_token : ambiguous_tokens) {
256-
if (ambiguous_token->change_to_next_possible_type()) {
257-
type_of_one_token_changed = true;
258-
break;
225+
// Update combination of ambiguous tokens
226+
type_of_one_token_changed = false;
227+
for (auto* ambiguous_token : ambiguous_tokens) {
228+
if (ambiguous_token->change_to_next_possible_type()) {
229+
type_of_one_token_changed = true;
230+
break;
231+
}
259232
}
260233
}
261234
}

0 commit comments

Comments
 (0)