Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
e6f48d1
basic xml parsing
mcb5637 Nov 6, 2025
2c92962
api, remove uneccesary graph
mcb5637 Nov 6, 2025
105e65d
fix sanitizer
mcb5637 Nov 7, 2025
24b3c03
cleanup
mcb5637 Nov 7, 2025
4c1b52e
entities
mcb5637 Nov 12, 2025
e518b99
more syntax+tests
mcb5637 Nov 12, 2025
e112429
base scoping
mcb5637 Nov 12, 2025
d040b2e
more tests
mcb5637 Nov 12, 2025
cfd7664
parsing state
mcb5637 Nov 13, 2025
db888f8
triples in properties
mcb5637 Nov 13, 2025
55d2cc9
more tests
mcb5637 Nov 13, 2025
ede429f
wip xml literal
mcb5637 Nov 14, 2025
ea0a1ae
cleanup
mcb5637 Nov 14, 2025
74a2538
collection
mcb5637 Nov 14, 2025
48e63c6
reifycation
mcb5637 Nov 19, 2025
97327ed
lang tag
mcb5637 Nov 19, 2025
b74a806
some cleanup
mcb5637 Nov 19, 2025
fb86ba8
list
mcb5637 Nov 20, 2025
1f7f0cc
more cleanup
mcb5637 Nov 20, 2025
c1eebb0
one parser to rule them all
mcb5637 Nov 21, 2025
0098e62
try fix gcc error
mcb5637 Nov 21, 2025
9251a7f
another gcc fix
mcb5637 Nov 21, 2025
48adc71
cleanup
mcb5637 Nov 26, 2025
df06c32
Merge branch 'develop' into feature/parse_xml
mcb5637 Nov 26, 2025
a2dbc6f
initial base
mcb5637 Nov 26, 2025
dfd809c
xmlliteral tests, cleanup
mcb5637 Nov 27, 2025
d64130a
fixes
mcb5637 Nov 28, 2025
5e8bac8
inplace poly
mcb5637 Nov 28, 2025
40c0f5a
doc
mcb5637 Nov 28, 2025
7f4261f
reorganize
mcb5637 Dec 3, 2025
c40d4dd
separate states
mcb5637 Dec 3, 2025
5ca0c2f
fix gcc14 bug again
mcb5637 Dec 3, 2025
efa4374
Merge branch 'develop' into feature/parse_xml
mcb5637 Dec 4, 2025
6cf39d5
remove iconv
mcb5637 Dec 4, 2025
f8f6454
states no longer get a Impl&
mcb5637 Dec 9, 2025
7417925
reorganize classes
mcb5637 Dec 11, 2025
5682b8c
prepare for tests
mcb5637 Dec 11, 2025
286815a
remove self-referentialness of struct and minor cleanup
liss-h Dec 17, 2025
88d9391
review
mcb5637 Jan 9, 2026
dd965a6
Merge remote-tracking branch 'origin/feature/parse_xml' into feature/…
mcb5637 Jan 9, 2026
e23492e
fix merge
mcb5637 Jan 9, 2026
71638ea
tests from rdf-tests repo
nkaralis Jan 21, 2026
95d11d6
added missing test dep
nkaralis Jan 21, 2026
4c26db9
tests
mcb5637 Jan 21, 2026
fc3e9ad
review
mcb5637 Jan 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ find_package(highway REQUIRED)
find_package(dice-hash REQUIRED)
find_package(dice-sparse-map REQUIRED)
find_package(dice-template-library REQUIRED)
find_package(libxml2 REQUIRED)

configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/version.hpp.in ${CMAKE_CURRENT_SOURCE_DIR}/src/rdf4cpp/version.hpp)

Expand Down Expand Up @@ -149,6 +150,17 @@ add_library(rdf4cpp
src/rdf4cpp/IRIFactory.cpp
src/rdf4cpp/util/Anonymizer.cpp
private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp
private/rdf4cpp/parser/XMLParser.cpp
private/rdf4cpp/parser/XMLParserUtility.cpp
private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp
private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp
private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp
private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp
private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp
private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp
private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp
private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp
private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp
private/rdf4cpp/regex/RegexImpl.cpp
private/rdf4cpp/regex/RegexReplacerImpl.cpp
${serd_source_files}
Expand Down Expand Up @@ -178,6 +190,7 @@ target_link_libraries(rdf4cpp
OpenSSL::Crypto
uni-algo::uni-algo
highway::highway
LibXml2::LibXml2
)

set_target_properties(rdf4cpp PROPERTIES
Expand Down
2 changes: 2 additions & 0 deletions conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,12 @@ def requirements(self):
self.requires("dice-hash/0.4.11", transitive_headers=True)
self.requires("dice-sparse-map/0.2.9", transitive_headers=True)
self.requires("dice-template-library/1.19.0", transitive_headers=True)
self.requires("libxml2/2.15.0", options={"iconv": False})
Comment thread
liss-h marked this conversation as resolved.

if self.options.with_test_deps:
self.test_requires("doctest/2.4.11")
self.test_requires("nanobench/4.3.11")
self.test_requires("libcurl/8.12.1")

def set_name(self):
if not hasattr(self, 'name') or self.version is None:
Expand Down
125 changes: 60 additions & 65 deletions private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp

Large diffs are not rendered by default.

24 changes: 15 additions & 9 deletions private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,22 @@

namespace rdf4cpp::parser {

struct IStreamQuadIterator::Impl {
struct IStreamQuadIterator::ImplSerd final : Impl {
using flags_type = IStreamQuadIterator::flags_type;
using state_type = IStreamQuadIterator::state_type;
using ok_type = IStreamQuadIterator::ok_type;
using error_type = IStreamQuadIterator::error_type;

private:
SerdReader *reader;
// workaround for gcc-14 bug, erroneously warns on using a lambda here
// see https://github.com/NVIDIA/stdexec/issues/1143
// Deleter for the owning reader pointer: ends the stream on the reader and then frees it.
struct SerdReaderDtorLambda {
void operator()(SerdReader* r) const {
serd_reader_end_stream(r);
serd_reader_free(r);
}
};
std::unique_ptr<SerdReader, SerdReaderDtorLambda> reader;

state_type *state;
bool state_is_owned;
Expand All @@ -33,11 +41,9 @@ struct IStreamQuadIterator::Impl {

flags_type flags;

private:
static std::string_view node_into_string_view(SerdNode const *node) noexcept;
static ParsingError::Type parsing_error_type_from_serd(SerdStatus st) noexcept;

private:
nonstd::expected<Node, SerdStatus> get_bnode(std::string &&graph_str, SerdNode const *node) noexcept;
nonstd::expected<IRI, SerdStatus> get_iri(SerdNode const *node) noexcept;
nonstd::expected<IRI, SerdStatus> get_prefixed_iri(SerdNode const *node) noexcept;
Expand All @@ -63,13 +69,13 @@ struct IStreamQuadIterator::Impl {
}

public:
Impl(void *stream,
ImplSerd(void *stream,
ReadFunc read,
ErrorFunc,
flags_type flags,
state_type *state) noexcept;

~Impl() noexcept;
~ImplSerd() override;

/**
* Tries to extract the next element from the serd backend.
Expand All @@ -81,10 +87,10 @@ struct IStreamQuadIterator::Impl {
* expected Quad: if there was a next element and it could be parsed
* unexpected ParsingError: if there was a next element but it could not be parsed
*/
[[nodiscard]] std::optional<nonstd::expected<ok_type, error_type>> next();
[[nodiscard]] std::optional<nonstd::expected<ok_type, error_type>> next() override;

[[nodiscard]] uint64_t current_line() const noexcept;
[[nodiscard]] uint64_t current_column() const noexcept;
[[nodiscard]] uint64_t current_line() const noexcept override;
[[nodiscard]] uint64_t current_column() const noexcept override;
};

} // namespace rdf4cpp::parser
Expand Down
180 changes: 180 additions & 0 deletions private/rdf4cpp/parser/XMLParser.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
#include <rdf4cpp/parser/XMLParser.hpp>

#include <rdf4cpp/parser/XMLParserStateTransition.hpp>

#include <ranges>

namespace rdf4cpp::parser {
/**
 * Builds the SAX callback table that is passed to libxml2 when the push parser is created.
 *
 * Only the callbacks this parser reacts to are populated, every other slot stays null.
 * Both entity lookups share get_entity, warnings and errors share on_error.
 *
 * @return the populated handler table
 */
xmlSAXHandler IStreamQuadIterator::ImplXML::make_sax_handler() {
    // value-initialisation already nulls every member, an additional memset is not needed
    xmlSAXHandler handler{};

    // marks the table as a SAX2 handler, which libxml2 requires for the *Ns element callbacks
    handler.initialized = XML_SAX2_MAGIC;

    handler.getParameterEntity = get_entity;
    handler.getEntity = get_entity;
    handler.characters = on_characters;
    handler.startElementNs = on_start_element;
    handler.endElementNs = on_end_element;
    handler.warning = on_error;
    handler.error = on_error;
    return handler;
}

// Applies the stack modification requested by a state callback.
// Depending on what `transition.modify_state` holds, the state stack is left untouched,
// its top entry is removed, or a new state is pushed on top.
// NOTE(review): presumably `match` dispatches on the active alternative of a variant-like type - confirm against dice-template-library.
void IStreamQuadIterator::ImplXML::handle_state_transition(StateTransition transition) {
dice::template_library::match(std::move(transition.modify_state),
[](NoStateChange) {
// noop
},
[this](PopState) {
// the currently active state is finished, drop it
state_stack_.pop_back();
},
[this]<typename S>(S &&new_state) {
// everything else is treated as the next state and constructed in place on top of the stack
state_stack_.emplace_back(std::in_place_type<S>, std::forward<S>(new_state));
}
);
}

// implemented here, to have access to states
/**
 * Tells whether the name given as namespace `uri` plus `local_name` matches one of the
 * reserved IRIs listed in the table below.
 *
 * The comparison itself is delegated to iri_equal_pieces; the table is checked front to back
 * and the search stops at the first match.
 *
 * @param uri namespace part of the name
 * @param local_name local part of the name
 * @return true if any table entry matches
 */
bool iri_reserved(std::string_view const uri, std::string_view const local_name) {
    static constexpr std::array reserved_names = {
            xml_states::RDFState::start_element,
            xml_states::DescriptionState::id_attrib,
            xml_states::DescriptionState::about_attrib,
            xml_states::PredicateState::parse_type_attrib,
            xml_states::PredicateState::resource_attrib,
            xml_states::DescriptionState::node_id_attrib,
            xml_states::TypedLiteralPredicateState::datatype_attrib,
            xml_states::BaseState::base_attribute,
            xml_states::BaseState::lang_attribute,
            std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"},
            std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"},
            std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"},
    };

    for (std::string_view const candidate : reserved_names) {
        if (iri_equal_pieces(candidate, uri, local_name)) {
            return true;
        }
    }
    return false;
}
/**
 * Tells whether the name given as namespace `uri` plus `local_name` matches one of the
 * core syntax IRIs listed in the table below.
 *
 * The comparison itself is delegated to iri_equal_pieces; the table is checked front to back
 * and the search stops at the first match.
 *
 * @param uri namespace part of the name
 * @param local_name local part of the name
 * @return true if any table entry matches
 */
bool iri_core_syntax(std::string_view const uri, std::string_view const local_name) {
    static constexpr std::array core_syntax_names = {
            xml_states::RDFState::start_element,
            xml_states::DescriptionState::id_attrib,
            xml_states::DescriptionState::about_attrib,
            xml_states::PredicateState::parse_type_attrib,
            xml_states::PredicateState::resource_attrib,
            xml_states::DescriptionState::node_id_attrib,
    };

    for (std::string_view const candidate : core_syntax_names) {
        if (iri_equal_pieces(candidate, uri, local_name)) {
            return true;
        }
    }
    return false;
}
/**
 * Tells whether the name given as namespace `uri` plus `local_name` matches one of the
 * three old-term IRIs (aboutEach, aboutEachPrefix, bagID) listed in the table below.
 *
 * The comparison itself is delegated to iri_equal_pieces; the table is checked front to back
 * and the search stops at the first match.
 *
 * @param uri namespace part of the name
 * @param local_name local part of the name
 * @return true if any table entry matches
 */
bool iri_old_term(std::string_view const uri, std::string_view const local_name) {
    static constexpr std::array old_terms = {
            std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"},
            std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"},
            std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"},
    };

    for (std::string_view const candidate : old_terms) {
        if (iri_equal_pieces(candidate, uri, local_name)) {
            return true;
        }
    }
    return false;
}

/**
 * Warning/error callback registered in the SAX handler table.
 *
 * Formats the printf-style message and queues it on the output as a BadSyntax parsing error,
 * together with the current position information from make_info().
 * If the message cannot be formatted, or formats to an empty string, a fixed fallback text is queued instead.
 *
 * @param th user data of the parser context, must point to the owning ImplXML
 * @param msg printf-style format string, followed by its arguments
 */
void IStreamQuadIterator::ImplXML::on_error(void *th, char const *msg, ...) { // NOLINT(*-dcl50-cpp)
    auto *self = static_cast<ImplXML *>(th);

    va_list args;
    va_list args_copy;
    va_start(args, msg);      // NOLINT(*-pro-bounds-array-to-pointer-decay)
    va_copy(args_copy, args); // NOLINT(*-pro-bounds-array-to-pointer-decay)

    // first pass: only measure the formatted length (excluding the terminator)
    int const needed = vsnprintf(nullptr, 0, msg, args_copy); // NOLINT(*-pro-bounds-array-to-pointer-decay)
    va_end(args_copy);                                        // NOLINT(*-pro-bounds-array-to-pointer-decay)

    std::string out{};
    int written = -1;
    // a negative result signals a formatting failure; it must never be turned into a buffer size
    if (needed > 0) {
        out.resize(static_cast<size_t>(needed) + 1, '\0');       // +1 for the terminator vsnprintf writes
        written = vsnprintf(out.data(), out.size(), msg, args); // NOLINT(*-pro-bounds-array-to-pointer-decay)
    }
    // both argument lists are released before the error is queued
    va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay)

    if (written > 0) {
        out.resize(static_cast<size_t>(written)); // strip the terminator again
    } else {
        out = "unknown error, too long to fit";
    }
    self->output_.add_error(ParsingError::Type::BadSyntax, std::move(out), self->make_info());
}
/**
 * Entity lookup callback registered for both general and parameter entities.
 *
 * Only consults libxml2's table of predefined entities; the parser context is not used.
 *
 * @param th user data of the parser context (unused)
 * @param e name of the requested entity
 * @return whatever xmlGetPredefinedEntity yields for that name
 */
xmlEntity *IStreamQuadIterator::ImplXML::get_entity([[maybe_unused]] void *th, xmlChar const *e) {
    xmlEntity *const predefined = xmlGetPredefinedEntity(e);
    return predefined;
}
/**
 * Character data callback: forwards the text to the currently active state and applies
 * the state transition that state asks for.
 *
 * @param th user data of the parser context, must point to the owning ImplXML
 * @param e start of the character data
 * @param len length of the character data
 */
void IStreamQuadIterator::ImplXML::on_characters(void *th, xmlChar const *e, int const len) {
    auto *self = static_cast<ImplXML *>(th);
    auto &active_state = self->current_state();

    self->handle_state_transition(
            active_state.on_characters(self->output_,
                                       from_xml_char(e, len),
                                       self->make_info()));
}
void IStreamQuadIterator::ImplXML::on_start_element(void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri,
[[maybe_unused]] int n_namespaces, [[maybe_unused]] xmlChar const **namespaces,
int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) {
auto *t = static_cast<ImplXML *>(th);
t->handle_state_transition(t->current_state().on_start_element(t->output_, from_xml_char(local_name), from_xml_char(uri),
std::span{reinterpret_cast<XMLAttribute *>(attributes), static_cast<size_t>(n_attributes)}, t->make_info()));
}
/**
 * End-of-element callback: notifies the currently active state and applies the state
 * transition that state asks for. The element name itself is not inspected.
 *
 * @param th user data of the parser context, must point to the owning ImplXML
 */
void IStreamQuadIterator::ImplXML::on_end_element(void *th, [[maybe_unused]] xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, [[maybe_unused]] xmlChar const *uri) {
    auto *self = static_cast<ImplXML *>(th);
    auto &active_state = self->current_state();

    self->handle_state_transition(
            active_state.on_end_element(self->output_, self->make_info()));
}

/**
 * Collects the context information that is handed to every state callback.
 *
 * - base and language tag: taken from the state closest to the top of the stack that has a
 *   non-empty value; empty if no state on the stack sets one.
 * - source window: a view into the parser's current input around the read position, plus the
 *   offset of the current position inside that window. If libxml2 cannot provide a window,
 *   the view is empty and the offset is 0.
 *
 * The returned views point into the state stack and into libxml2's input buffer.
 *
 * @return line, column, base, language tag, source window and offset into the window
 */
XMLStateInfo IStreamQuadIterator::ImplXML::make_info() const {
    std::string_view base = "";
    std::string_view lang_tag = "";
    // walk from the innermost state outwards, the first non-empty value of each kind wins
    for (auto const &s : state_stack_ | std::views::reverse) {
        if (base.empty()) {
            std::string_view const candidate = s->base;
            if (!candidate.empty()) {
                base = candidate;
            }
        }
        if (lang_tag.empty()) {
            std::string_view const candidate = s->lang_tag;
            if (!candidate.empty()) {
                lang_tag = candidate;
            }
        }
        if (!base.empty() && !lang_tag.empty()) {
            break;
        }
    }

    xmlChar const *data = nullptr;
    int size = 1024; // in: requested maximum window size, out: actual window size
    int off = 0;
    std::string_view source{};
    // the call reports failure via its return value and then leaves the outputs untouched,
    // so the window may only be used after a successful call
    if (xmlCtxtGetInputWindow(context_.get(), 0, &data, &size, &off) == 0 && data != nullptr && size >= 0) {
        source = std::string_view{reinterpret_cast<char const *>(data), static_cast<size_t>(size)};
    } else {
        off = 0;
    }

    return XMLStateInfo{
            current_line(),
            current_column(),
            base,
            lang_tag,
            source,
            off,
    };
}

/**
 * Sets up the libxml2 push parser and the initial parser state.
 *
 * The SAX handler table is built first and then passed, together with `this` as callback user
 * data, to the newly created parser context. The state stack starts with a single InitialState
 * whose base is the current base IRI of the output queue.
 *
 * @param obj opaque stream object passed to the stream callbacks
 * @param read callback used to read from the stream
 * @param err callback used to query the stream's error state
 * @param eof callback used to query the stream's end-of-file state
 * @param state parsing state handed to the output queue
 */
IStreamQuadIterator::ImplXML::ImplXML(void *obj, ReadFunc const read, ErrorFunc const err, EOFFunc const eof, state_type *state)
    : handler_(make_sax_handler()),
      context_(xmlCreatePushParserCtxt(&handler_, this, nullptr, 0, "rdf/xml")),
      reader_obj_(obj),
      read_func_(read),
      error_func_(err),
      eof_func_(eof),
      output_(state) {
    constexpr int parser_options = XML_PARSE_NOENT
                                   | XML_PARSE_PEDANTIC
                                   | XML_PARSE_NOCDATA
                                   | XML_PARSE_NO_XXE
                                   | XML_PARSE_BIG_LINES;
    xmlCtxtSetOptions(context_.get(), parser_options);

    constexpr size_t initial_stack_capacity = 10;
    state_stack_.reserve(initial_stack_capacity);
    state_stack_.emplace_back(std::in_place_type<xml_states::InitialState>);

    // the outermost state inherits the base IRI that is already known to the output queue
    current_state().base = output_.current_base_iri();
}

std::optional<IStreamQuadIterator::value_type> IStreamQuadIterator::ImplXML::next() {
std::array<char, 8192> buffer; // NOLINT(*-pro-type-member-init)
while (output_.empty() && error_func_(reader_obj_) == 0 && eof_func_(reader_obj_) == 0) {
auto const read = read_func_(buffer.data(), sizeof(char), buffer.size(), reader_obj_);
xmlParseChunk(context_.get(), buffer.data(), static_cast<int>(read), eof_func_(reader_obj_) != 0);
}
return output_.next();
}

/**
 * @return the line number libxml2 reports for the parser context
 */
uint64_t IStreamQuadIterator::ImplXML::current_line() const noexcept {
    auto const line = xmlSAX2GetLineNumber(context_.get());
    return line;
}

/**
 * @return the column number libxml2 reports for the parser context
 */
uint64_t IStreamQuadIterator::ImplXML::current_column() const noexcept {
    auto const column = xmlSAX2GetColumnNumber(context_.get());
    return column;
}
} // namespace rdf4cpp::parser
91 changes: 91 additions & 0 deletions private/rdf4cpp/parser/XMLParser.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#ifndef RDF4CPP_XMLPARSER_H
#define RDF4CPP_XMLPARSER_H

#include <rdf4cpp/Expected.hpp>
#include <rdf4cpp/Quad.hpp>
#include <rdf4cpp/IRIFactory.hpp>
#include <rdf4cpp/parser/IStreamQuadIterator.hpp>
#include <rdf4cpp/parser/XMLParserUtility.hpp>
#include <rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp>
#include <rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp>
#include <rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp>
#include <rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp>
#include <rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp>
#include <rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp>
#include <rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp>
#include <rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp>
#include <rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp>

#include <dice/sparse-map/sparse_set.hpp>
#include <dice/template-library/inplace_polymorphic.hpp>

#include <libxml/parser.h>

#include <memory>
#include <vector>

namespace rdf4cpp::parser {
// IStreamQuadIterator backend that parses XML input with a libxml2 push parser.
// The SAX callbacks forward to the state on top of state_stack_; results are collected in output_.
struct IStreamQuadIterator::ImplXML final : Impl {
private:
// SAX callback table; declared before context_ because the constructor passes its address
// to the parser context and members are initialised in declaration order
xmlSAXHandler handler_;
// workaround for gcc-14 bug, erroneously warns on using a lambda here
// see https://github.com/NVIDIA/stdexec/issues/1143
struct XmlParserCtxtDtorLambda {
void operator()(xmlParserCtxt *c) const {
xmlFreeParserCtxt(c);
}
};
// owning handle of the libxml2 parser context, freed via xmlFreeParserCtxt
std::unique_ptr<xmlParserCtxt, XmlParserCtxtDtorLambda> context_;
// opaque stream object and the callbacks used to read from it / query its error and eof state
void *reader_obj_;
ReadFunc read_func_;
ErrorFunc error_func_;
EOFFunc eof_func_;
// queue the callbacks write into and next() reads from
XMLOutputQueue output_;

// storage for any one of the listed state types, accessed through the BaseState interface
using State = dice::template_library::inplace_polymorphic<xml_states::BaseState,
xml_states::InitialState, xml_states::RDFState,
xml_states::DescriptionState, xml_states::PredicateState,
xml_states::TypedLiteralPredicateState, xml_states::EmptyElement,
xml_states::XMLLiteralState, xml_states::CollectionState>;

std::vector<State> state_stack_; // Note: we use a vector because std::stack does not have .reserve()

// state on top of the stack; the stack must not be empty
[[nodiscard]] xml_states::BaseState const &current_state() const noexcept {
return *state_stack_.back();
}

// state on top of the stack; the stack must not be empty
[[nodiscard]] xml_states::BaseState &current_state() noexcept {
return *state_stack_.back();
}

// builds the callback table stored in handler_
static xmlSAXHandler make_sax_handler();

// applies the stack change (none / pop / push) requested by a state callback
void handle_state_transition(StateTransition transition);

// libxml2 SAX callbacks; th is the user data of the parser context, i.e. the owning ImplXML
static void on_error(void *th, char const *msg, ...);
static xmlEntity *get_entity(void *th, xmlChar const *e);
static void on_characters(void *th, xmlChar const *e, int len);
static void on_start_element(void *th, xmlChar const *local_name, xmlChar const *prefix, xmlChar const *uri,
int n_namespaces, xmlChar const **namespaces,
int n_attributes, int n_defaulted, xmlChar const **attributes);
static void on_end_element(void *th, xmlChar const *local_name, xmlChar const *prefix, xmlChar const *uri);

// snapshot of position, base, language tag and source window passed to the state callbacks
[[nodiscard]] XMLStateInfo make_info() const;

public:
ImplXML(void *obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type *state);

// neither copyable nor movable: the constructor hands `this` and the address of handler_ to libxml2
ImplXML(ImplXML const &) = delete;
ImplXML &operator=(ImplXML const &) = delete;
ImplXML(ImplXML &&) = delete;
ImplXML &operator=(ImplXML &&) = delete;
~ImplXML() override = default;

// yields the next queued element, feeding more input to the parser while the queue is empty
[[nodiscard]] std::optional<value_type> next() override;

// position as reported by libxml2 for the parser context
[[nodiscard]] uint64_t current_line() const noexcept override;
[[nodiscard]] uint64_t current_column() const noexcept override;
};
} // namespace rdf4cpp::parser

#endif //RDF4CPP_XMLPARSER_H
Loading
Loading