Skip to content

Commit 38b47ec

Browse files
authored
Feature/pcre2 (#430)
1 parent 9ca9686 commit 38b47ec

12 files changed

Lines changed: 425 additions & 184 deletions

File tree

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ endif ()
1616
# find packages
1717
find_package(expected-lite REQUIRED)
1818
find_package(Boost REQUIRED)
19-
find_package(re2 REQUIRED)
19+
find_package(PCRE2 REQUIRED)
2020
find_package(OpenSSL REQUIRED)
2121
find_package(uni-algo REQUIRED)
2222
find_package(highway REQUIRED)
@@ -186,7 +186,7 @@ target_link_libraries(rdf4cpp
186186
dice-sparse-map::dice-sparse-map
187187
dice-template-library::dice-template-library
188188
PRIVATE
189-
re2::re2
189+
pcre2::pcre2
190190
OpenSSL::Crypto
191191
uni-algo::uni-algo
192192
highway::highway

conanfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ class Recipe(ConanFile):
3232
def requirements(self):
3333
self.requires("boost/1.86.0", transitive_headers=True, libs=False)
3434
self.requires("expected-lite/0.8.0", transitive_headers=True)
35-
self.requires("re2/20230301") # using latest version that does not require abseil
35+
self.requires("pcre2/10.44", options={"support_jit": True})
3636
self.requires("openssl/3.3.2")
3737
self.requires("uni-algo/1.2.0")
3838
self.requires("highway/1.2.0")
Lines changed: 99 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,123 @@
11
#include "RegexImpl.hpp"
22

3+
#include <uni_algo/conv.h>
4+
35
namespace rdf4cpp::regex {
46

5-
namespace detail {
7+
Regex::Impl::Impl(std::string_view const regex, flag_type const flags)
8+
: Impl{make(regex, flags)} {
9+
}
10+
11+
TriBool Regex::Impl::regex_match(std::string_view const str) const noexcept {
12+
return apply(*match, str);
13+
}
614

7-
/**
8-
* Translates the regex flags of rdf4cpp's public interface to RE2 Options.
9-
*
10-
* @param flags the rdf4cpp flags to translate
11-
* @return the translated RE2 Options
12-
*/
13-
static re2::RE2::Options translate_flags(Regex::flag_type const flags) {
14-
re2::RE2::Options o;
15-
o.set_log_errors(false);
16-
o.set_dot_nl(flags.contains(RegexFlag::DotAll));
17-
o.set_case_sensitive(!flags.contains(RegexFlag::CaseInsensitive));
18-
o.set_literal(flags.contains(RegexFlag::Literal));
15+
TriBool Regex::Impl::regex_search(std::string_view const str) const noexcept {
16+
return apply(*search, str);
17+
}
1918

20-
return o;
21-
}
19+
pcre2_match_context_8 &Regex::Impl::get_match_context() {
20+
using match_ctx_ptr = std::unique_ptr<pcre2_match_context_8, CallFree<pcre2_match_context_free_8>>;
21+
static auto match_ctx = []() {
22+
match_ctx_ptr r{pcre2_match_context_create_8(nullptr)};
23+
pcre2_set_heap_limit_8(r.get(), 32); // 32 KB
24+
pcre2_set_match_limit_8(r.get(), 100000);
25+
// jit stack limit is 32 KB (to change it, match_ctx needs to be thread local, then add a jit stack)
26+
return r;
27+
}();
28+
return *match_ctx;
29+
}
30+
std::string Regex::Impl::translate_error_code(int error_code) {
31+
std::string msg;
32+
msg.resize(120); // https://pcre2project.github.io/pcre2/doc/pcre2api/#geterrormessage says 120 chars is enough for any error message
33+
msg.resize(pcre2_get_error_message_8(error_code, reinterpret_cast<PCRE2_UCHAR8 *>(msg.data()), msg.size()));
34+
return msg;
35+
}
2236

23-
static re2::RE2 build_regex(std::string_view regex, Regex::flag_type flags) {
24-
auto opt = translate_flags(flags);
25-
if (!flags.contains(RegexFlag::Multiline) && !flags.contains(RegexFlag::RemoveWhitespace)) {
26-
return {regex, opt};
37+
TriBool Regex::Impl::apply(pcre2_code_8 &c, std::string_view str) noexcept {
38+
assert(una::is_valid_utf8(str));
39+
match_data_ptr const m{pcre2_match_data_create_from_pattern_8(&c, nullptr)};
40+
auto ec = pcre2_match_8(&c, reinterpret_cast<PCRE2_SPTR8>(str.data()), str.size(), 0, PCRE2_NO_UTF_CHECK, m.get(), &get_match_context());
41+
if (ec == PCRE2_ERROR_NOMATCH) {
42+
return TriBool::False;
43+
}
44+
return ec >= 0 ? TriBool::True : TriBool::Err;
2745
}
28-
// https://www.w3.org/TR/xpath-functions/#flags
29-
// re2 does not support x
30-
// and m needs to be passed as re2 flag (Options::set_one_line is ignored, if Options::posix_syntax == false)
31-
std::string x{};
32-
x.reserve(regex.size()+4);
33-
if (flags.contains(RegexFlag::Multiline)) {
34-
x.append("(?m)");
46+
47+
Regex::Impl::code_ptr Regex::Impl::make_code(std::string_view regex, flag_type flags, int extra_flags) {
48+
using compile_ctr_ptr = std::unique_ptr<pcre2_compile_context_8, CallFree<pcre2_compile_context_free_8>>;
49+
static auto compile_ctx = []() {
50+
compile_ctr_ptr r{pcre2_compile_context_create_8(nullptr)};
51+
pcre2_set_max_pattern_length_8(r.get(), 1024ul * 1024 * 4); // 4 MB
52+
pcre2_set_max_pattern_compiled_length_8(r.get(), 1024ul * 1024 * 4); // 4 MB
53+
// lookbehind limit defaults to 255
54+
// parens nest limit defaults to 250
55+
return r;
56+
}();
57+
58+
assert(una::is_valid_utf8(regex));
59+
int error_code = 0;
60+
size_t err_off = 0;
61+
int f = PCRE2_UTF | PCRE2_NO_UTF_CHECK | extra_flags;
62+
if (flags.contains(RegexFlag::DotAll)) {
63+
f |= PCRE2_DOTALL;
64+
}
65+
if (flags.contains(RegexFlag::CaseInsensitive)) {
66+
f |= PCRE2_CASELESS;
67+
}
68+
if (flags.contains(RegexFlag::Literal)) {
69+
f |= PCRE2_LITERAL;
70+
} else {
71+
f |= PCRE2_UCP | PCRE2_NEVER_BACKSLASH_C;
72+
}
73+
if (flags.contains(RegexFlag::Multiline)) {
74+
f |= PCRE2_MULTILINE;
75+
}
76+
code_ptr r{pcre2_compile_8(reinterpret_cast<PCRE2_SPTR8>(regex.data()), regex.size(), f, &error_code, &err_off, compile_ctx.get())};
77+
if (r == nullptr) {
78+
throw RegexError{"Failed to compile regex: " + translate_error_code(error_code)};
79+
}
80+
if (flags.contains(RegexFlag::Optimize)) {
81+
error_code = pcre2_jit_compile_8(r.get(), 0);
82+
if (error_code != 0) {
83+
throw RegexError{"Failed to compile jit regex: " + translate_error_code(error_code)};
84+
}
85+
}
86+
return r;
3587
}
36-
if (flags.contains(RegexFlag::RemoveWhitespace)) {
88+
89+
std::string Regex::Impl::remove_whitespace(std::string_view str) {
90+
std::string r;
91+
r.reserve(str.size());
3792
uint64_t classes = 0;
3893
char prev = '\0';
39-
for (char const c : regex) {
94+
for (char const c : str) {
4095
if (c == '[' && prev != '\\') {
4196
++classes;
4297
} else if (c == ']' && prev != '\\') {
4398
--classes;
4499
} else if (classes == 0 && (c == '\t' || c == '\r' || c == '\n' || c == ' ')) {
45100
continue;
46101
}
47-
x.append(1, c);
102+
r.append(1, c);
48103
prev = c;
49104
}
50-
} else {
51-
x.append(regex);
105+
return r;
52106
}
53-
return {x, opt};
54-
}
55-
56-
} // namespace detail
57-
58-
Regex::Impl::Impl(std::string_view const regex, Regex::flag_type const flags) : regex{detail::build_regex(regex, flags)}, flags{flags} {
59-
if (!this->regex.ok()) {
60-
throw RegexError{"Failed to compile regex: " + this->regex.error()};
107+
Regex::Impl Regex::Impl::make(std::string_view regex, flag_type flags) {
108+
std::string buff = "";
109+
if (flags.contains(RegexFlag::RemoveWhitespace)) {
110+
buff = remove_whitespace(regex);
111+
regex = buff;
112+
}
113+
auto m = make_code(regex, flags, PCRE2_ANCHORED | PCRE2_ENDANCHORED);
114+
auto s = make_code(regex, flags, 0);
115+
return {std::move(m), std::move(s), flags};
116+
}
117+
Regex::Impl::Impl(code_ptr match, code_ptr search, flag_type flags)
118+
: match(std::move(match)),
119+
search(std::move(search)),
120+
flags(flags) {
61121
}
62-
}
63-
64-
bool Regex::Impl::regex_match(std::string_view const str) const noexcept {
65-
return RE2::FullMatch(str, this->regex);
66-
}
67-
68-
bool Regex::Impl::regex_search(std::string_view const str) const noexcept {
69-
return RE2::PartialMatch(str, this->regex);
70-
}
71122

72123
} //namespace rdf4cpp::regex

private/rdf4cpp/regex/RegexImpl.hpp

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,43 @@
33

44
#include <rdf4cpp/regex/Regex.hpp>
55

6-
#include <re2/re2.h>
6+
#define PCRE2_CODE_UNIT_WIDTH 8
7+
#include <pcre2.h>
78

89
namespace rdf4cpp::regex {
10+
struct Regex::Impl {
11+
private:
12+
// workaround for gcc-14 bug, erroneously warns on using a lambda here
13+
// see https://github.com/NVIDIA/stdexec/issues/1143
14+
template<auto f>
15+
struct CallFree {
16+
void operator()(auto* c) {
17+
f(c);
18+
}
19+
};
920

10-
struct Regex::Impl {
11-
re2::RE2 regex;
12-
Regex::flag_type flags;
21+
using code_ptr = std::unique_ptr<pcre2_code_8, CallFree<pcre2_code_free_8>>;
1322

14-
Impl(std::string_view regex, Regex::flag_type flags);
15-
[[nodiscard]] bool regex_match(std::string_view str) const noexcept;
16-
[[nodiscard]] bool regex_search(std::string_view str) const noexcept;
17-
};
23+
public:
24+
code_ptr match;
25+
code_ptr search;
26+
Regex::flag_type flags;
1827

28+
Impl(std::string_view regex, flag_type flags);
29+
[[nodiscard]] TriBool regex_match(std::string_view str) const noexcept;
30+
[[nodiscard]] TriBool regex_search(std::string_view str) const noexcept;
31+
[[nodiscard]] static pcre2_match_context_8& get_match_context();
32+
[[nodiscard]] static std::string translate_error_code(int error_code);
33+
34+
private:
35+
using match_data_ptr = std::unique_ptr<pcre2_match_data_8, CallFree<pcre2_match_data_free_8>>;
36+
[[nodiscard]] static TriBool apply(pcre2_code_8 &c, std::string_view str) noexcept;
37+
38+
static code_ptr make_code(std::string_view regex, flag_type flags, int extra_flags);
39+
static std::string remove_whitespace(std::string_view str);
40+
static Impl make(std::string_view regex, flag_type flags);
41+
Impl(code_ptr match, code_ptr search, flag_type flags);
42+
};
1943
} //namespace rdf4cpp::regex
2044

2145
#endif //RDF4CPP_RDF_UTIL_PRIVATE_REGEX_IMPL_HPP

0 commit comments

Comments
 (0)