|
1 | 1 | #include "RegexImpl.hpp" |
2 | 2 |
|
| 3 | +#include <uni_algo/conv.h> |
| 4 | + |
3 | 5 | namespace rdf4cpp::regex { |
4 | 6 |
|
5 | | -namespace detail { |
| 7 | + Regex::Impl::Impl(std::string_view const regex, flag_type const flags) |
| 8 | + : Impl{make(regex, flags)} { |
| 9 | + } |
| 10 | + |
| 11 | + TriBool Regex::Impl::regex_match(std::string_view const str) const noexcept { |
| 12 | + return apply(*match, str); |
| 13 | + } |
6 | 14 |
|
7 | | -/** |
8 | | - * Translates the regex flags of rdf4cpp's public interface to RE2 Options. |
9 | | - * |
10 | | - * @param flags the rdf4cpp flags to translate |
11 | | - * @return the translated RE2 Options |
12 | | - */ |
13 | | -static re2::RE2::Options translate_flags(Regex::flag_type const flags) { |
14 | | - re2::RE2::Options o; |
15 | | - o.set_log_errors(false); |
16 | | - o.set_dot_nl(flags.contains(RegexFlag::DotAll)); |
17 | | - o.set_case_sensitive(!flags.contains(RegexFlag::CaseInsensitive)); |
18 | | - o.set_literal(flags.contains(RegexFlag::Literal)); |
| 15 | + TriBool Regex::Impl::regex_search(std::string_view const str) const noexcept { |
| 16 | + return apply(*search, str); |
| 17 | + } |
19 | 18 |
|
20 | | - return o; |
21 | | -} |
| 19 | + pcre2_match_context_8 &Regex::Impl::get_match_context() { |
| 20 | + using match_ctx_ptr = std::unique_ptr<pcre2_match_context_8, CallFree<pcre2_match_context_free_8>>; |
| 21 | + static auto match_ctx = []() { |
| 22 | + match_ctx_ptr r{pcre2_match_context_create_8(nullptr)}; |
| 23 | + pcre2_set_heap_limit_8(r.get(), 32); // 32 KB |
| 24 | + pcre2_set_match_limit_8(r.get(), 100000); |
| 25 | + // jit stack limit is 32 KB (to change it, match_ctx needs to be thread local, then add a jit stack) |
| 26 | + return r; |
| 27 | + }(); |
| 28 | + return *match_ctx; |
| 29 | + } |
| 30 | + std::string Regex::Impl::translate_error_code(int error_code) { |
| 31 | + std::string msg; |
| 32 | + msg.resize(120); // https://pcre2project.github.io/pcre2/doc/pcre2api/#geterrormessage says 120 chars is enough for any error message |
| 33 | + msg.resize(pcre2_get_error_message_8(error_code, reinterpret_cast<PCRE2_UCHAR8 *>(msg.data()), msg.size())); |
| 34 | + return msg; |
| 35 | + } |
22 | 36 |
|
23 | | -static re2::RE2 build_regex(std::string_view regex, Regex::flag_type flags) { |
24 | | - auto opt = translate_flags(flags); |
25 | | - if (!flags.contains(RegexFlag::Multiline) && !flags.contains(RegexFlag::RemoveWhitespace)) { |
26 | | - return {regex, opt}; |
| 37 | + TriBool Regex::Impl::apply(pcre2_code_8 &c, std::string_view str) noexcept { |
| 38 | + assert(una::is_valid_utf8(str)); |
| 39 | + match_data_ptr const m{pcre2_match_data_create_from_pattern_8(&c, nullptr)}; |
| 40 | + auto ec = pcre2_match_8(&c, reinterpret_cast<PCRE2_SPTR8>(str.data()), str.size(), 0, PCRE2_NO_UTF_CHECK, m.get(), &get_match_context()); |
| 41 | + if (ec == PCRE2_ERROR_NOMATCH) { |
| 42 | + return TriBool::False; |
| 43 | + } |
| 44 | + return ec >= 0 ? TriBool::True : TriBool::Err; |
27 | 45 | } |
28 | | - // https://www.w3.org/TR/xpath-functions/#flags |
29 | | - // re2 does not support x |
30 | | - // and m needs to be passed as re2 flag (Options::set_one_line is ignored, if Options::posix_syntax == false) |
31 | | - std::string x{}; |
32 | | - x.reserve(regex.size()+4); |
33 | | - if (flags.contains(RegexFlag::Multiline)) { |
34 | | - x.append("(?m)"); |
| 46 | + |
| 47 | + Regex::Impl::code_ptr Regex::Impl::make_code(std::string_view regex, flag_type flags, int extra_flags) { |
| 48 | + using compile_ctr_ptr = std::unique_ptr<pcre2_compile_context_8, CallFree<pcre2_compile_context_free_8>>; |
| 49 | + static auto compile_ctx = []() { |
| 50 | + compile_ctr_ptr r{pcre2_compile_context_create_8(nullptr)}; |
| 51 | + pcre2_set_max_pattern_length_8(r.get(), 1024ul * 1024 * 4); // 4 MB |
| 52 | + pcre2_set_max_pattern_compiled_length_8(r.get(), 1024ul * 1024 * 4); // 4 MB |
| 53 | + // lookbehind limit defaults to 255 |
| 54 | + // parens nest limit defaults to 250 |
| 55 | + return r; |
| 56 | + }(); |
| 57 | + |
| 58 | + assert(una::is_valid_utf8(regex)); |
| 59 | + int error_code = 0; |
| 60 | + size_t err_off = 0; |
| 61 | + int f = PCRE2_UTF | PCRE2_NO_UTF_CHECK | extra_flags; |
| 62 | + if (flags.contains(RegexFlag::DotAll)) { |
| 63 | + f |= PCRE2_DOTALL; |
| 64 | + } |
| 65 | + if (flags.contains(RegexFlag::CaseInsensitive)) { |
| 66 | + f |= PCRE2_CASELESS; |
| 67 | + } |
| 68 | + if (flags.contains(RegexFlag::Literal)) { |
| 69 | + f |= PCRE2_LITERAL; |
| 70 | + } else { |
| 71 | + f |= PCRE2_UCP | PCRE2_NEVER_BACKSLASH_C; |
| 72 | + } |
| 73 | + if (flags.contains(RegexFlag::Multiline)) { |
| 74 | + f |= PCRE2_MULTILINE; |
| 75 | + } |
| 76 | + code_ptr r{pcre2_compile_8(reinterpret_cast<PCRE2_SPTR8>(regex.data()), regex.size(), f, &error_code, &err_off, compile_ctx.get())}; |
| 77 | + if (r == nullptr) { |
| 78 | + throw RegexError{"Failed to compile regex: " + translate_error_code(error_code)}; |
| 79 | + } |
| 80 | + if (flags.contains(RegexFlag::Optimize)) { |
| 81 | + error_code = pcre2_jit_compile_8(r.get(), 0); |
| 82 | + if (error_code != 0) { |
| 83 | + throw RegexError{"Failed to compile jit regex: " + translate_error_code(error_code)}; |
| 84 | + } |
| 85 | + } |
| 86 | + return r; |
35 | 87 | } |
36 | | - if (flags.contains(RegexFlag::RemoveWhitespace)) { |
| 88 | + |
| 89 | + std::string Regex::Impl::remove_whitespace(std::string_view str) { |
| 90 | + std::string r; |
| 91 | + r.reserve(str.size()); |
37 | 92 | uint64_t classes = 0; |
38 | 93 | char prev = '\0'; |
39 | | - for (char const c : regex) { |
| 94 | + for (char const c : str) { |
40 | 95 | if (c == '[' && prev != '\\') { |
41 | 96 | ++classes; |
42 | 97 | } else if (c == ']' && prev != '\\') { |
43 | 98 | --classes; |
44 | 99 | } else if (classes == 0 && (c == '\t' || c == '\r' || c == '\n' || c == ' ')) { |
45 | 100 | continue; |
46 | 101 | } |
47 | | - x.append(1, c); |
| 102 | + r.append(1, c); |
48 | 103 | prev = c; |
49 | 104 | } |
50 | | - } else { |
51 | | - x.append(regex); |
| 105 | + return r; |
52 | 106 | } |
53 | | - return {x, opt}; |
54 | | -} |
55 | | - |
56 | | -} // namespace detail |
57 | | - |
58 | | -Regex::Impl::Impl(std::string_view const regex, Regex::flag_type const flags) : regex{detail::build_regex(regex, flags)}, flags{flags} { |
59 | | - if (!this->regex.ok()) { |
60 | | - throw RegexError{"Failed to compile regex: " + this->regex.error()}; |
| 107 | + Regex::Impl Regex::Impl::make(std::string_view regex, flag_type flags) { |
| 108 | + std::string buff = ""; |
| 109 | + if (flags.contains(RegexFlag::RemoveWhitespace)) { |
| 110 | + buff = remove_whitespace(regex); |
| 111 | + regex = buff; |
| 112 | + } |
| 113 | + auto m = make_code(regex, flags, PCRE2_ANCHORED | PCRE2_ENDANCHORED); |
| 114 | + auto s = make_code(regex, flags, 0); |
| 115 | + return {std::move(m), std::move(s), flags}; |
| 116 | + } |
| 117 | + Regex::Impl::Impl(code_ptr match, code_ptr search, flag_type flags) |
| 118 | + : match(std::move(match)), |
| 119 | + search(std::move(search)), |
| 120 | + flags(flags) { |
61 | 121 | } |
62 | | -} |
63 | | - |
64 | | -bool Regex::Impl::regex_match(std::string_view const str) const noexcept { |
65 | | - return RE2::FullMatch(str, this->regex); |
66 | | -} |
67 | | - |
68 | | -bool Regex::Impl::regex_search(std::string_view const str) const noexcept { |
69 | | - return RE2::PartialMatch(str, this->regex); |
70 | | -} |
71 | 122 |
|
72 | 123 | } //namespace rdf4cpp::regex |
0 commit comments