Skip to content

Commit af4d1a2

Browse files
mcb5637 and nkaralis authored
Feature: RDF/XML parser (#410)
Co-authored-by: Nikolaos Karalis <nkaralis@mail.uni-paderborn.de>
1 parent e291681 commit af4d1a2

38 files changed

Lines changed: 2129 additions & 81 deletions

CMakeLists.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ find_package(highway REQUIRED)
2323
find_package(dice-hash REQUIRED)
2424
find_package(dice-sparse-map REQUIRED)
2525
find_package(dice-template-library REQUIRED)
26+
find_package(libxml2 REQUIRED)
2627

2728
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/version.hpp.in ${CMAKE_CURRENT_SOURCE_DIR}/src/rdf4cpp/version.hpp)
2829

@@ -149,6 +150,17 @@ add_library(rdf4cpp
149150
src/rdf4cpp/IRIFactory.cpp
150151
src/rdf4cpp/util/Anonymizer.cpp
151152
private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp
153+
private/rdf4cpp/parser/XMLParser.cpp
154+
private/rdf4cpp/parser/XMLParserUtility.cpp
155+
private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp
156+
private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp
157+
private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp
158+
private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp
159+
private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp
160+
private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp
161+
private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp
162+
private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp
163+
private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp
152164
private/rdf4cpp/regex/RegexImpl.cpp
153165
private/rdf4cpp/regex/RegexReplacerImpl.cpp
154166
${serd_source_files}
@@ -178,6 +190,7 @@ target_link_libraries(rdf4cpp
178190
OpenSSL::Crypto
179191
uni-algo::uni-algo
180192
highway::highway
193+
LibXml2::LibXml2
181194
)
182195

183196
set_target_properties(rdf4cpp PROPERTIES

conanfile.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,12 @@ def requirements(self):
3939
self.requires("dice-hash/0.4.11", transitive_headers=True)
4040
self.requires("dice-sparse-map/0.2.9", transitive_headers=True)
4141
self.requires("dice-template-library/1.19.0", transitive_headers=True)
42+
self.requires("libxml2/2.15.0", options={"iconv": False})
4243

4344
if self.options.with_test_deps:
4445
self.test_requires("doctest/2.4.11")
4546
self.test_requires("nanobench/4.3.11")
47+
self.test_requires("libcurl/8.12.1")
4648

4749
def set_name(self):
4850
if not hasattr(self, 'name') or self.version is None:

private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp

Lines changed: 60 additions & 65 deletions
Large diffs are not rendered by default.

private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,22 @@
1414

1515
namespace rdf4cpp::parser {
1616

17-
struct IStreamQuadIterator::Impl {
17+
struct IStreamQuadIterator::ImplSerd final : Impl {
1818
using flags_type = IStreamQuadIterator::flags_type;
1919
using state_type = IStreamQuadIterator::state_type;
2020
using ok_type = IStreamQuadIterator::ok_type;
2121
using error_type = IStreamQuadIterator::error_type;
2222

2323
private:
24-
SerdReader *reader;
24+
// workaround for gcc-14 bug, erroneously warns on using a lambda here
25+
// see https://github.com/NVIDIA/stdexec/issues/1143
26+
struct SerdReaderDtorLambda {
27+
void operator()(SerdReader* r) const {
28+
serd_reader_end_stream(r);
29+
serd_reader_free(r);
30+
}
31+
};
32+
std::unique_ptr<SerdReader, SerdReaderDtorLambda> reader;
2533

2634
state_type *state;
2735
bool state_is_owned;
@@ -33,11 +41,9 @@ struct IStreamQuadIterator::Impl {
3341

3442
flags_type flags;
3543

36-
private:
3744
static std::string_view node_into_string_view(SerdNode const *node) noexcept;
3845
static ParsingError::Type parsing_error_type_from_serd(SerdStatus st) noexcept;
3946

40-
private:
4147
nonstd::expected<Node, SerdStatus> get_bnode(std::string &&graph_str, SerdNode const *node) noexcept;
4248
nonstd::expected<IRI, SerdStatus> get_iri(SerdNode const *node) noexcept;
4349
nonstd::expected<IRI, SerdStatus> get_prefixed_iri(SerdNode const *node) noexcept;
@@ -63,13 +69,13 @@ struct IStreamQuadIterator::Impl {
6369
}
6470

6571
public:
66-
Impl(void *stream,
72+
ImplSerd(void *stream,
6773
ReadFunc read,
6874
ErrorFunc,
6975
flags_type flags,
7076
state_type *state) noexcept;
7177

72-
~Impl() noexcept;
78+
~ImplSerd() override;
7379

7480
/**
7581
* Tries to extract the next element from the serd backend.
@@ -81,10 +87,10 @@ struct IStreamQuadIterator::Impl {
8187
* expected Quad: if there was a next element and it could be parsed
8288
* unexpected ParsingError: if there was a next element but it could not be parsed
8389
*/
84-
[[nodiscard]] std::optional<nonstd::expected<ok_type, error_type>> next();
90+
[[nodiscard]] std::optional<nonstd::expected<ok_type, error_type>> next() override;
8591

86-
[[nodiscard]] uint64_t current_line() const noexcept;
87-
[[nodiscard]] uint64_t current_column() const noexcept;
92+
[[nodiscard]] uint64_t current_line() const noexcept override;
93+
[[nodiscard]] uint64_t current_column() const noexcept override;
8894
};
8995

9096
} // namespace rdf4cpp::parser
Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
#include <rdf4cpp/parser/XMLParser.hpp>
2+
3+
#include <rdf4cpp/parser/XMLParserStateTransition.hpp>
4+
5+
#include <ranges>
6+
7+
namespace rdf4cpp::parser {
8+
/**
 * Builds the SAX callback table that is handed to libxml2.
 *
 * Only the callbacks this parser uses are set; every other entry stays null.
 * Warnings and errors are both routed to on_error, and both entity lookups
 * (general and parameter entities) are routed to get_entity.
 *
 * @return the populated handler, returned by value
 */
xmlSAXHandler IStreamQuadIterator::ImplXML::make_sax_handler() {
    // Value-initialization zeroes every member of this C aggregate, so the
    // former std::memset (which also relied on an indirectly included <cstring>)
    // is not needed.
    xmlSAXHandler handler{};

    handler.initialized = XML_SAX2_MAGIC;

    // entity resolution
    handler.getParameterEntity = get_entity;
    handler.getEntity = get_entity;

    // document content
    handler.characters = on_characters;
    handler.startElementNs = on_start_element;
    handler.endElementNs = on_end_element;

    // diagnostics: warnings are reported through the same path as errors
    handler.warning = on_error;
    handler.error = on_error;

    return handler;
}
21+
22+
/**
 * Applies the state-stack change requested by a state callback.
 *
 * Depending on the alternative held by transition.modify_state this either
 * leaves the stack untouched (NoStateChange), removes the top state (PopState),
 * or pushes the carried state object on top of the stack (any other alternative).
 *
 * @param transition result of a state callback; its modify_state member is moved from
 */
void IStreamQuadIterator::ImplXML::handle_state_transition(StateTransition transition) {
    auto &stack = state_stack_;

    dice::template_library::match(
            std::move(transition.modify_state),
            [](NoStateChange) {
                // nothing to do, the current state stays on top
            },
            [&stack](PopState) {
                stack.pop_back();
            },
            [&stack]<typename NewState>(NewState &&pushed) {
                // construct the new state in place inside the polymorphic stack slot
                stack.emplace_back(std::in_place_type<NewState>, std::forward<NewState>(pushed));
            });
}
35+
36+
// implemented here, to have access to states
37+
bool iri_reserved(std::string_view const uri, std::string_view const local_name) {
38+
static constexpr std::array reserved = {
39+
xml_states::RDFState::start_element,
40+
xml_states::DescriptionState::id_attrib,
41+
xml_states::DescriptionState::about_attrib,
42+
xml_states::PredicateState::parse_type_attrib,
43+
xml_states::PredicateState::resource_attrib,
44+
xml_states::DescriptionState::node_id_attrib,
45+
xml_states::TypedLiteralPredicateState::datatype_attrib,
46+
xml_states::BaseState::base_attribute,
47+
xml_states::BaseState::lang_attribute,
48+
std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"),
49+
std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"),
50+
std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"),
51+
};
52+
return std::ranges::any_of(reserved, [&](std::string_view const e) {
53+
return iri_equal_pieces(e, uri, local_name);
54+
});
55+
}
56+
bool iri_core_syntax(std::string_view const uri, std::string_view const local_name) {
57+
static constexpr std::array reserved = {
58+
xml_states::RDFState::start_element,
59+
xml_states::DescriptionState::id_attrib,
60+
xml_states::DescriptionState::about_attrib,
61+
xml_states::PredicateState::parse_type_attrib,
62+
xml_states::PredicateState::resource_attrib,
63+
xml_states::DescriptionState::node_id_attrib,
64+
};
65+
return std::ranges::any_of(reserved, [&](std::string_view const e) {
66+
return iri_equal_pieces(e, uri, local_name);
67+
});
68+
}
69+
bool iri_old_term(std::string_view const uri, std::string_view const local_name) {
70+
static constexpr std::array reserved = {
71+
std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"},
72+
std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"},
73+
std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"},
74+
};
75+
return std::ranges::any_of(reserved, [&](std::string_view const e) {
76+
return iri_equal_pieces(e, uri, local_name);
77+
});
78+
}
79+
80+
void IStreamQuadIterator::ImplXML::on_error(void *th, char const *msg, ...) { // NOLINT(*-dcl50-cpp)
81+
va_list args;
82+
va_list args_copy;
83+
auto t = static_cast<ImplXML *>(th);
84+
va_start(args, msg); // NOLINT(*-pro-bounds-array-to-pointer-decay)
85+
va_copy(args_copy, args); // NOLINT(*-pro-bounds-array-to-pointer-decay)
86+
std::string out{};
87+
out.resize(1+vsnprintf(nullptr, 0, msg, args_copy), '\0'); // NOLINT(*-pro-bounds-array-to-pointer-decay)
88+
auto l = vsnprintf(out.data(), out.size(), msg, args); // NOLINT(*-pro-bounds-array-to-pointer-decay)
89+
if (l > 0) {
90+
out.resize(l);
91+
} else {
92+
out = "unknown error, too long to fit";
93+
}
94+
t->output_.add_error(ParsingError::Type::BadSyntax, std::move(out), t->make_info());
95+
va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay)
96+
va_end(args_copy); // NOLINT(*-pro-bounds-array-to-pointer-decay)
97+
}
98+
/**
 * SAX entity lookup callback (registered for both general and parameter entities).
 *
 * The user-data pointer is ignored; the lookup is delegated entirely to
 * xmlGetPredefinedEntity, so no document-declared entities are consulted here.
 *
 * @param e name of the entity to resolve
 * @return whatever xmlGetPredefinedEntity returns for that name
 */
xmlEntity *IStreamQuadIterator::ImplXML::get_entity(void *, xmlChar const *e) {
    xmlEntity *const predefined = xmlGetPredefinedEntity(e);
    return predefined;
}
101+
void IStreamQuadIterator::ImplXML::on_characters(void *th, xmlChar const *e, int const len) {
102+
auto *t = static_cast<ImplXML *>(th);
103+
t->handle_state_transition(t->current_state().on_characters(t->output_, from_xml_char(e, len), t->make_info()));
104+
}
105+
void IStreamQuadIterator::ImplXML::on_start_element(void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri,
106+
[[maybe_unused]] int n_namespaces, [[maybe_unused]] xmlChar const **namespaces,
107+
int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) {
108+
auto *t = static_cast<ImplXML *>(th);
109+
t->handle_state_transition(t->current_state().on_start_element(t->output_, from_xml_char(local_name), from_xml_char(uri),
110+
std::span{reinterpret_cast<XMLAttribute *>(attributes), static_cast<size_t>(n_attributes)}, t->make_info()));
111+
}
112+
void IStreamQuadIterator::ImplXML::on_end_element(void *th, [[maybe_unused]] xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, [[maybe_unused]] xmlChar const *uri) {
113+
auto *t = static_cast<ImplXML *>(th);
114+
t->handle_state_transition(t->current_state().on_end_element(t->output_, t->make_info()));
115+
}
116+
117+
/**
 * Collects the context information that is passed to every state callback.
 *
 * - base / lang_tag: the first non-empty value found when walking the state
 *   stack from the top downwards; empty if no state provides one.
 * - source / offset: a window (at most 1024 bytes requested) into the parser's
 *   current input together with the offset reported by libxml2. If the window
 *   cannot be obtained, an empty source and offset 0 are reported.
 *
 * @return the assembled XMLStateInfo; the contained views refer to data owned
 *         by the state stack and the parser context
 */
XMLStateInfo IStreamQuadIterator::ImplXML::make_info() const {
    std::string_view base = "";
    for (auto const &s : state_stack_ | std::views::reverse) {
        std::string_view const v = s->base;
        if (!v.empty()) {
            base = v;
            break;
        }
    }

    std::string_view lang_tag = "";
    for (auto const &s : state_stack_ | std::views::reverse) {
        std::string_view const v = s->lang_tag;
        if (!v.empty()) {
            lang_tag = v;
            break;
        }
    }

    xmlChar const *data = nullptr;
    int size = 1024; // in: maximum window size, out: actual window size
    int off = 0;
    std::string_view source = "";

    // The call can fail (e.g. missing context or input); in that case `data` must
    // not be read, otherwise a view over an indeterminate pointer would be built.
    int const rc = xmlCtxtGetInputWindow(context_.get(), 0, &data, &size, &off);
    if (rc >= 0 && data != nullptr && size > 0) {
        source = std::string_view{reinterpret_cast<char const *>(data), static_cast<size_t>(size)};
    } else {
        off = 0;
    }

    return XMLStateInfo{
            current_line(),
            current_column(),
            base,
            lang_tag,
            source,
            off,
    };
}
151+
152+
/**
 * Sets up a libxml2 push parser that feeds this object's SAX callbacks.
 *
 * `this` is registered as the parser's user data (the callbacks cast it back
 * to ImplXML), and the address of handler_ is handed to libxml2, which is
 * why the class is neither copyable nor movable.
 *
 * NOTE(review): the result of xmlCreatePushParserCtxt is not checked for null here.
 *
 * @param obj opaque stream object passed to read, err and eof
 * @param read function used to pull bytes from obj
 * @param err function reporting the error status of obj (0 means no error)
 * @param eof function reporting the end-of-file status of obj (0 means not at end)
 * @param state parser state forwarded to the output queue
 */
IStreamQuadIterator::ImplXML::ImplXML(void *obj, ReadFunc const read, ErrorFunc const err, EOFFunc const eof, state_type *state)
    : handler_(make_sax_handler()),
      context_(xmlCreatePushParserCtxt(&handler_, this, nullptr, 0, "rdf/xml")),
      reader_obj_(obj),
      read_func_(read),
      error_func_(err),
      eof_func_(eof),
      output_(state) {
    constexpr int parser_options = XML_PARSE_NOENT
                                   | XML_PARSE_PEDANTIC
                                   | XML_PARSE_NOCDATA
                                   | XML_PARSE_NO_XXE
                                   | XML_PARSE_BIG_LINES;
    xmlCtxtSetOptions(context_.get(), parser_options);

    // pre-allocate room for a moderately nested document
    constexpr size_t initial_stack_capacity = 10;
    state_stack_.reserve(initial_stack_capacity);
    state_stack_.emplace_back(std::in_place_type<xml_states::InitialState>);

    // the initial state starts out with the base IRI known to the output queue
    current_state().base = output_.current_base_iri();
}
163+
164+
std::optional<IStreamQuadIterator::value_type> IStreamQuadIterator::ImplXML::next() {
165+
std::array<char, 8192> buffer; // NOLINT(*-pro-type-member-init)
166+
while (output_.empty() && error_func_(reader_obj_) == 0 && eof_func_(reader_obj_) == 0) {
167+
auto const read = read_func_(buffer.data(), sizeof(char), buffer.size(), reader_obj_);
168+
xmlParseChunk(context_.get(), buffer.data(), static_cast<int>(read), eof_func_(reader_obj_) != 0);
169+
}
170+
return output_.next();
171+
}
172+
173+
uint64_t IStreamQuadIterator::ImplXML::current_line() const noexcept {
174+
return xmlSAX2GetLineNumber(context_.get());
175+
}
176+
177+
uint64_t IStreamQuadIterator::ImplXML::current_column() const noexcept {
178+
return xmlSAX2GetColumnNumber(context_.get());
179+
}
180+
} // namespace rdf4cpp::parser
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#ifndef RDF4CPP_XMLPARSER_H
2+
#define RDF4CPP_XMLPARSER_H
3+
4+
#include <rdf4cpp/Expected.hpp>
5+
#include <rdf4cpp/Quad.hpp>
6+
#include <rdf4cpp/IRIFactory.hpp>
7+
#include <rdf4cpp/parser/IStreamQuadIterator.hpp>
8+
#include <rdf4cpp/parser/XMLParserUtility.hpp>
9+
#include <rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp>
10+
#include <rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp>
11+
#include <rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp>
12+
#include <rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp>
13+
#include <rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp>
14+
#include <rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp>
15+
#include <rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp>
16+
#include <rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp>
17+
#include <rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp>
18+
19+
#include <dice/sparse-map/sparse_set.hpp>
20+
#include <dice/template-library/inplace_polymorphic.hpp>
21+
22+
#include <libxml/parser.h>
23+
24+
#include <memory>
25+
#include <vector>
26+
27+
namespace rdf4cpp::parser {
28+
struct IStreamQuadIterator::ImplXML final : Impl {
29+
private:
30+
xmlSAXHandler handler_;
31+
// workaround for gcc-14 bug, erroneously warns on unsing a lambda here
32+
// see https://github.com/NVIDIA/stdexec/issues/1143
33+
struct XmlParserCtxtDtorLambda {
34+
void operator()(xmlParserCtxt *c) const {
35+
xmlFreeParserCtxt(c);
36+
}
37+
};
38+
std::unique_ptr<xmlParserCtxt, XmlParserCtxtDtorLambda> context_;
39+
void *reader_obj_;
40+
ReadFunc read_func_;
41+
ErrorFunc error_func_;
42+
EOFFunc eof_func_;
43+
XMLOutputQueue output_;
44+
45+
using State = dice::template_library::inplace_polymorphic<xml_states::BaseState,
46+
xml_states::InitialState, xml_states::RDFState,
47+
xml_states::DescriptionState, xml_states::PredicateState,
48+
xml_states::TypedLiteralPredicateState, xml_states::EmptyElement,
49+
xml_states::XMLLiteralState, xml_states::CollectionState>;
50+
51+
std::vector<State> state_stack_; // Note: we use a vector because std::stack does not have .reserve()
52+
53+
[[nodiscard]] xml_states::BaseState const &current_state() const noexcept {
54+
return *state_stack_.back();
55+
}
56+
57+
[[nodiscard]] xml_states::BaseState &current_state() noexcept {
58+
return *state_stack_.back();
59+
}
60+
61+
static xmlSAXHandler make_sax_handler();
62+
63+
void handle_state_transition(StateTransition transition);
64+
65+
static void on_error(void *th, char const *msg, ...);
66+
static xmlEntity *get_entity(void *th, xmlChar const *e);
67+
static void on_characters(void *th, xmlChar const *e, int len);
68+
static void on_start_element(void *th, xmlChar const *local_name, xmlChar const *prefix, xmlChar const *uri,
69+
int n_namespaces, xmlChar const **namespaces,
70+
int n_attributes, int n_defaulted, xmlChar const **attributes);
71+
static void on_end_element(void *th, xmlChar const *local_name, xmlChar const *prefix, xmlChar const *uri);
72+
73+
[[nodiscard]] XMLStateInfo make_info() const;
74+
75+
public:
76+
ImplXML(void *obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type *state);
77+
78+
ImplXML(ImplXML const &) = delete;
79+
ImplXML &operator=(ImplXML const &) = delete;
80+
ImplXML(ImplXML &&) = delete;
81+
ImplXML &operator=(ImplXML &&) = delete;
82+
~ImplXML() override = default;
83+
84+
[[nodiscard]] std::optional<value_type> next() override;
85+
86+
[[nodiscard]] uint64_t current_line() const noexcept override;
87+
[[nodiscard]] uint64_t current_column() const noexcept override;
88+
};
89+
} // namespace rdf4cpp::parser
90+
91+
#endif //RDF4CPP_XMLPARSER_H

0 commit comments

Comments
 (0)