From e6f48d1f508ba35096e450b73ed9be71d1fa00e0 Mon Sep 17 00:00:00 2001 From: mcb Date: Thu, 6 Nov 2025 16:12:25 +0100 Subject: [PATCH 01/42] basic xml parsing --- CMakeLists.txt | 3 + conanfile.py | 2 + src/rdf4cpp/parser/XMLParser.cpp | 431 +++++++++++++++++++++++++++++++ src/rdf4cpp/parser/XMLParser.hpp | 55 ++++ tests/CMakeLists.txt | 7 + tests/parser/tests_XMLParser.cpp | 118 +++++++++ 6 files changed, 616 insertions(+) create mode 100644 src/rdf4cpp/parser/XMLParser.cpp create mode 100644 src/rdf4cpp/parser/XMLParser.hpp create mode 100644 tests/parser/tests_XMLParser.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 73af644d..4ebeb1e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,7 @@ find_package(highway REQUIRED) find_package(dice-hash REQUIRED) find_package(dice-sparse-map REQUIRED) find_package(dice-template-library REQUIRED) +find_package(libxml2 REQUIRED) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/version.hpp.in ${CMAKE_CURRENT_SOURCE_DIR}/src/rdf4cpp/version.hpp) @@ -130,6 +131,7 @@ add_library(rdf4cpp src/rdf4cpp/namespaces/RDF.cpp src/rdf4cpp/parser/IStreamQuadIterator.cpp src/rdf4cpp/parser/RDFFileParser.cpp + src/rdf4cpp/parser/XMLParser.cpp src/rdf4cpp/query/QuadPattern.cpp src/rdf4cpp/query/Solution.cpp src/rdf4cpp/query/TriplePattern.cpp @@ -177,6 +179,7 @@ target_link_libraries(rdf4cpp OpenSSL::Crypto uni-algo::uni-algo highway::highway + LibXml2::LibXml2 ) set_target_properties(rdf4cpp PROPERTIES diff --git a/conanfile.py b/conanfile.py index ab8ddcbe..3d766ff6 100644 --- a/conanfile.py +++ b/conanfile.py @@ -39,6 +39,8 @@ def requirements(self): self.requires("dice-hash/0.4.11", transitive_headers=True) self.requires("dice-sparse-map/0.2.9", transitive_headers=True) self.requires("dice-template-library/1.13.0", transitive_headers=True) + self.requires("libxml2/2.14.5") + self.requires("zlib/1.3.1", force=True) if self.options.with_test_deps: self.test_requires("doctest/2.4.11") diff --git 
a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp new file mode 100644 index 00000000..dba409e7 --- /dev/null +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -0,0 +1,431 @@ +#include "XMLParser.hpp" + +#include +#include + +namespace rdf4cpp::parser { + struct XMLQuadIterator::Impl { + private: + xmlSAXHandler handler_; + std::unique_ptr context_; + std::istream &stream_; // NOLINT(*-avoid-const-or-ref-data-members) + std::deque result_queue_; + size_t next_bn_index_ = 0; + + struct Attribute { + xmlChar const *local_name_raw; + xmlChar const *prefix_raw; + xmlChar const *uri_raw; + xmlChar const *value_start_raw; + xmlChar const *value_end_raw; + + [[nodiscard]] std::string_view value() const { + return {reinterpret_cast(value_start_raw), reinterpret_cast(value_end_raw)}; + } + [[nodiscard]] std::string_view local_name() const { + return {reinterpret_cast(local_name_raw)}; + } + [[nodiscard]] std::string_view uri() const { + return {reinterpret_cast(uri_raw)}; + } + }; + + struct BaseState { // NOLINT(*-special-member-functions) + virtual ~BaseState() = default; + virtual void on_characters(Impl *impl, std::string_view chars) = 0; + virtual void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) = 0; + virtual void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) = 0; // TODO remove params? 
+ virtual void re_enter([[maybe_unused]] Impl *impl, [[maybe_unused]] Node obj) { // TODO remove if not needed for something + } + }; + + struct EmptyState final : BaseState { + void on_characters(Impl *impl, std::string_view chars) override; + void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + }; + struct RDFState final : BaseState { + void on_characters(Impl *impl, std::string_view chars) override; + void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + + static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF"; + }; + struct DescriptionState final : BaseState { + void on_characters(Impl *impl, std::string_view chars) override; + void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + + Node subject; + + explicit DescriptionState(Node sub) + : subject(sub) { + } + + static Node try_enter(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes); + + static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description"; + static constexpr std::string_view about_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about"; + }; + struct PredicateState : BaseState { + void on_characters(Impl *impl, std::string_view chars) override; + void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + + Node subject; + IRI predicate; + std::string 
literal; + bool done = false; + + PredicateState(Node sub, IRI predicate) + : subject(sub), predicate(predicate) { + } + + static constexpr std::string_view resource_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#resource"; + }; + struct TypedLiteralPredicateState final : PredicateState { + void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + + IRI datatype; + + TypedLiteralPredicateState(Node iri, IRI predicate, IRI datatype) + : PredicateState(iri, predicate), datatype(datatype) { + } + + static constexpr std::string_view datatype_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype"; + }; + + struct EmptyElement final : BaseState { + void on_characters(Impl *impl, std::string_view chars) override; + void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + }; + + BaseState *current_state_ = nullptr; + std::vector> state_stack_; + + static xmlSAXHandler make_sax_handler(); + + void add_error(ParsingError::Type ty, std::string msg); + /** + * add statement to the output list, if none of the components is null + * (null is used to track an already inserted parse error for that component) + * @param subject + * @param predicate + * @param object + * @param graph + */ + void add_statement(Node subject, IRI predicate, Node object, std::optional graph = std::nullopt); + void update_current_state(); + void pop_state(Node object); + static std::string_view trim(std::string_view v); + static bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); + IRI try_make_iri(std::string_view iri); + IRI try_make_iri(std::string_view uri, std::string_view local_name); + BlankNode make_bn(); + + static void 
on_error(void *th, char const *msg, ...); + + public: + explicit Impl(std::istream &stream); + + std::optional next(); + }; + + xmlSAXHandler XMLQuadIterator::Impl::make_sax_handler() { + xmlSAXHandler r{}; + std::memset(&r, 0, sizeof(xmlSAXHandler)); + r.initialized = XML_SAX2_MAGIC; + r.getEntity = [](void *, xmlChar const *e) { + return xmlGetPredefinedEntity(e); + }; + r.characters = [](void *th, xmlChar const *e, int len) { + auto *t = static_cast(th); + t->current_state_->on_characters(t, std::string_view(reinterpret_cast(e), static_cast(len))); + }; + r.startElementNs = [](void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri, + [[maybe_unused]] int n_namespaces, [[maybe_unused]] xmlChar const **namespaces, + int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) { + auto *t = static_cast(th); + t->current_state_->on_start_element(t, reinterpret_cast(local_name), reinterpret_cast(uri), + std::span{reinterpret_cast(attributes), static_cast(n_attributes)}); + }; + r.endElementNs = [](void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri) { + auto *t = static_cast(th); + t->current_state_->on_end_element(t, reinterpret_cast(local_name), reinterpret_cast(uri)); + }; + r.warning = on_error; + r.error = on_error; + return r; + } + + void XMLQuadIterator::Impl::add_error(ParsingError::Type const ty, std::string msg) { + uint64_t const lin = xmlSAX2GetLineNumber(context_.get()); + uint64_t const col = xmlSAX2GetColumnNumber(context_.get()); + result_queue_.emplace_back(nonstd::unexpect, ty, lin, col, std::move(msg)); + } + void XMLQuadIterator::Impl::add_statement(Node subject, IRI predicate, Node object, std::optional graph) { + if (subject.null() || predicate.null() || object.null()) { + return; + } + if (graph.has_value()) { + if (graph->null()) { + return; + } + result_queue_.emplace_back(Quad(*graph, subject, predicate, object)); + } else { + 
result_queue_.emplace_back(Quad(subject, predicate, object)); + } + } + void XMLQuadIterator::Impl::update_current_state() { + if (state_stack_.empty()) { + current_state_ = nullptr; + } + current_state_ = std::visit([](auto &s) -> BaseState * { return &s; }, state_stack_.back()); + } + void XMLQuadIterator::Impl::pop_state(Node object) { + assert(!state_stack_.empty()); + state_stack_.pop_back(); + update_current_state(); + current_state_->re_enter(this, object); + } + std::string_view XMLQuadIterator::Impl::trim(std::string_view v) { + auto s = v.find_first_not_of(" \t\r\n"); + if (s == std::string_view::npos) { + return ""; + } + v.remove_prefix(s); + // ReSharper disable once CppDFALocalValueEscapesFunction + return v; + } + bool XMLQuadIterator::Impl::iri_equal_pieces(std::string_view const full_iri, std::string_view const uri, std::string_view const local_name) { + if (full_iri.size() != local_name.size() + uri.size()) { + return false; + } + return full_iri.starts_with(uri) && full_iri.ends_with(local_name); + } + IRI XMLQuadIterator::Impl::try_make_iri(std::string_view const iri) { + auto exp = IRIFactory::create_and_validate(iri); + if (exp.has_value()) { + return *exp; + } else { + add_error(ParsingError::Type::BadIri, std::format("{}: {}", iri, exp.error())); + return IRI::make_null(); + } + } + IRI XMLQuadIterator::Impl::try_make_iri(std::string_view const uri, std::string_view const local_name) { + std::string iri{uri}; + iri.append(local_name); + return try_make_iri(iri); + } + BlankNode XMLQuadIterator::Impl::make_bn() { + return BlankNode::make_unchecked(std::format("bn_{}", next_bn_index_++)); + } + void XMLQuadIterator::Impl::on_error(void *th, char const *msg, ...) 
{ // NOLINT(*-dcl50-cpp) + va_list args; + auto t = static_cast(th); + va_start(args, msg); // NOLINT(*-pro-bounds-array-to-pointer-decay) + std::string out{}; + out.resize(1024, '\0'); + auto l = vsnprintf(out.data(), out.size(), msg, args); // NOLINT(*-pro-bounds-array-to-pointer-decay) + if (l > 0) + { + out.resize(l); + } else { + out = "unknown error, too long to fit"; + } + t->add_error(ParsingError::Type::BadSyntax, std::move(out)); + va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay) + } + + void XMLQuadIterator::Impl::EmptyState::on_characters(Impl *impl, std::string_view const chars) { + if (!trim(chars).empty()) { + impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found characters"); + } + } + void XMLQuadIterator::Impl::EmptyState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes) { + if (iri_equal_pieces(RDFState::start_element, uri, local_name)) { + impl->state_stack_.emplace_back(std::in_place_type_t{}); + impl->update_current_state(); + return; + } + impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found ???"); + } + void XMLQuadIterator::Impl::EmptyState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { + impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found end of ???"); + } + + void XMLQuadIterator::Impl::RDFState::on_characters(Impl *impl, std::string_view const chars) { + if (!trim(chars).empty()) { + impl->add_error(ParsingError::Type::BadSyntax, "expected Description, found characters"); + } + } + void XMLQuadIterator::Impl::RDFState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { + if (!DescriptionState::try_enter(impl, local_name, uri, attributes).null()) { + return; + } + impl->add_error(ParsingError::Type::BadSyntax, "expected Description, found ???"); + } + void 
XMLQuadIterator::Impl::RDFState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { + impl->pop_state(Node::make_null()); + } + + void XMLQuadIterator::Impl::DescriptionState::on_characters(Impl *impl, std::string_view const chars) { + if (!trim(chars).empty()) { + impl->add_error(ParsingError::Type::BadSyntax, "expected predicate, found characters"); + } + } + void XMLQuadIterator::Impl::DescriptionState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span attributes) { + std::string s{uri}; + s.append(local_name); + auto predicate = impl->try_make_iri(s); + std::optional datatype = std::nullopt; + std::optional ref = std::nullopt; + for (auto const &att : attributes) { + if (iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name())) { + datatype = impl->try_make_iri(att.value()); + } else if (iri_equal_pieces(PredicateState::resource_attrib, att.uri(), att.local_name())) { + ref = impl->try_make_iri(att.value()); + } + } + if (datatype.has_value()) { + impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate, *datatype); + } else if (ref.has_value()) { + impl->add_statement(subject, predicate, *ref); + impl->state_stack_.emplace_back(std::in_place_type_t{}); + } else { + impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate); + } + impl->update_current_state(); + } + void XMLQuadIterator::Impl::DescriptionState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { + impl->pop_state(subject); + } + Node XMLQuadIterator::Impl::DescriptionState::try_enter(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { + if (iri_equal_pieces(start_element, uri, local_name)) { + for (auto const &att : attributes) { + if (iri_equal_pieces(about_attrib, att.uri(), att.local_name())) { + 
IRI iri = impl->try_make_iri(att.value()); + impl->state_stack_.emplace_back(std::in_place_type_t{}, iri); + impl->update_current_state(); + return iri; + } + } + auto bn = impl->make_bn(); + impl->state_stack_.emplace_back(std::in_place_type_t{}, bn); + impl->update_current_state(); + return bn; + } + return Node::make_null(); + } + + void XMLQuadIterator::Impl::PredicateState::on_characters([[maybe_unused]] Impl *impl, std::string_view const chars) { + if (done) { + if (!trim(chars).empty()) { + impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found literal"); + } + return; + } + literal.append(chars); + } + void XMLQuadIterator::Impl::PredicateState::on_start_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + if (!trim(literal).empty()) { + impl->add_error(ParsingError::Type::BadSyntax, "expected end of element or literal, found element"); + return; + } + if (done) { + impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found element"); + return; + } + auto const obj = DescriptionState::try_enter(impl, local_name, uri, attributes); + if (!obj.null()) { + std::get(impl->state_stack_[impl->state_stack_.size()-2]).done = true; + impl->add_statement(subject, predicate, obj); + return; + } + impl->add_error(ParsingError::Type::BadSyntax, "expected Description or literal, found element"); + } + void XMLQuadIterator::Impl::PredicateState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { + if (!done) { + Literal lit = Literal::make_null(); + try { + lit = Literal::make_simple(literal); + } catch (std::runtime_error const &e) { // InvalidNode is subclass + impl->add_error(ParsingError::Type::BadLiteral, e.what()); + } + impl->add_statement(subject, predicate, lit); + } + impl->pop_state(Node::make_null()); + } + + void 
XMLQuadIterator::Impl::TypedLiteralPredicateState::on_start_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + impl->add_error(ParsingError::Type::BadSyntax, "expected literal, found element"); + } + void XMLQuadIterator::Impl::TypedLiteralPredicateState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { + if (!datatype.null()) { + Literal lit = Literal::make_null(); + try { + lit = Literal::make_typed(literal, datatype); + } catch (std::runtime_error const &e) { // InvalidNode is subclass + impl->add_error(ParsingError::Type::BadLiteral, e.what()); + } + impl->add_statement(subject, predicate, lit); + } + impl->pop_state(Node::make_null()); + } + + void XMLQuadIterator::Impl::EmptyElement::on_characters(Impl *impl, std::string_view const chars) { + if (!trim(chars).empty()) { + impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found characters"); + } + } + void XMLQuadIterator::Impl::EmptyElement::on_start_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found ???"); + } + void XMLQuadIterator::Impl::EmptyElement::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { + impl->pop_state(Node::make_null()); + } + + XMLQuadIterator::Impl::Impl(std::istream &stream) + : handler_(make_sax_handler()), + context_(xmlCreatePushParserCtxt(&handler_, this, nullptr, 0, "mem")), + stream_(stream) { + state_stack_.emplace_back(std::in_place_type_t{}); + update_current_state(); + } + + std::optional XMLQuadIterator::Impl::next() { + char buffer[1024]; + while (result_queue_.empty() && stream_.good() && !stream_.eof()) { + stream_.read(static_cast(buffer), 
sizeof(buffer)); + xmlParseChunk(context_.get(), static_cast(buffer), static_cast(stream_.gcount()), stream_.eof()); + } + if (result_queue_.empty()) { + return std::nullopt; + } + auto r = result_queue_.front(); + result_queue_.pop_front(); + return r; + } + + + XMLQuadIterator::XMLQuadIterator(std::istream &stream) + : impl_(std::make_unique(stream)), cur_(impl_->next()) { + } + XMLQuadIterator::~XMLQuadIterator() noexcept = default; + + XMLQuadIterator::reference XMLQuadIterator::operator*() const noexcept { + return *cur_; + } + XMLQuadIterator::pointer XMLQuadIterator::operator->() const noexcept { + return &*cur_; + } + XMLQuadIterator &XMLQuadIterator::operator++() { + cur_ = impl_->next(); + return *this; + } + bool XMLQuadIterator::operator==(std::default_sentinel_t) const noexcept { + return !cur_.has_value(); + } +} // namespace rdf4cpp::parser diff --git a/src/rdf4cpp/parser/XMLParser.hpp b/src/rdf4cpp/parser/XMLParser.hpp new file mode 100644 index 00000000..a1e8919d --- /dev/null +++ b/src/rdf4cpp/parser/XMLParser.hpp @@ -0,0 +1,55 @@ +#ifndef RDF4CPP_XMLPARSER_HPP +#define RDF4CPP_XMLPARSER_HPP + +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace rdf4cpp::parser { + struct XMLQuadIterator { + using flags_type = ParsingFlags; + using state_type = ParsingState; + using ok_type = Quad; + using error_type = ParsingError; + + using value_type = nonstd::expected; + using reference = value_type const &; + using pointer = value_type const *; + using difference_type = std::ptrdiff_t; + using iterator_category = std::input_iterator_tag; + using istream_type = std::istream; + + private: + struct Impl; + + std::unique_ptr impl_; + std::optional> cur_; + + public: + explicit XMLQuadIterator(std::istream& stream); + + XMLQuadIterator(XMLQuadIterator&&) noexcept = delete; + XMLQuadIterator& operator=(XMLQuadIterator&&) noexcept = delete; + + XMLQuadIterator(XMLQuadIterator const &) = delete; + XMLQuadIterator& 
operator=(XMLQuadIterator const &) = delete; + + ~XMLQuadIterator() noexcept; + + reference operator*() const noexcept; + pointer operator->() const noexcept; + XMLQuadIterator &operator++(); + + bool operator==(std::default_sentinel_t) const noexcept; + }; +} + +#endif //RDF4CPP_XMLPARSER_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 548eb1d8..7cfbed9b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -354,6 +354,13 @@ target_link_libraries(tests_Serialization ) add_test(NAME tests_Serialization COMMAND tests_Serialization) +add_executable(tests_XMLParser parser/tests_XMLParser.cpp) +target_link_libraries(tests_XMLParser + doctest::doctest + rdf4cpp +) +add_test(NAME tests_XMLParser COMMAND tests_XMLParser) + if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/test_swdf/swdf.nt") file(DOWNLOAD "https://files.tentris.dev/swdf.zip" "${CMAKE_CURRENT_BINARY_DIR}/test_swdf/swdf.zip") execute_process(COMMAND unzip "${CMAKE_CURRENT_BINARY_DIR}/test_swdf/swdf.zip" -d "${CMAKE_CURRENT_BINARY_DIR}/test_swdf") diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp new file mode 100644 index 00000000..0e427698 --- /dev/null +++ b/tests/parser/tests_XMLParser.cpp @@ -0,0 +1,118 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include "rdf4cpp/parser/XMLParser.hpp" + + +#include + +#include +#include + +#include + + +using namespace rdf4cpp; +using namespace rdf4cpp::parser; + +TEST_CASE("sanity test") { + std::stringstream str{R"( + + example + 42 + not a number + + + true + + + other example + + + + + blank example + + + + + blank example 2 + + + +)"}; + + XMLQuadIterator it{str}; + CHECK(it != std::default_sentinel); + CHECK(it->has_value()); + CHECK(it->value().subject() == IRI::make("https://www.example.com")); + CHECK(it->value().predicate() == IRI::make("https://www.example.com/title")); + CHECK(it->value().object() == Literal::make_simple("example")); + ++it; + CHECK(it != std::default_sentinel); + 
CHECK(it->has_value()); + CHECK(it->value().subject() == IRI::make("https://www.example.com")); + CHECK(it->value().predicate() == IRI::make("https://www.example.com/cost")); + CHECK(it->value().object() == Literal::make_typed_from_value(42)); + ++it; + CHECK(it != std::default_sentinel); + CHECK(!it->has_value()); + CHECK(it->error().error_type == ParsingError::Type::BadLiteral); + CHECK(it->error().message == "http://www.w3.org/2001/XMLSchema#int parsing error: found n, invalid for datatype"); + ++it; + CHECK(it != std::default_sentinel); + CHECK(it->has_value()); + CHECK(it->value().subject() == IRI::make("https://www.example.com")); + CHECK(it->value().predicate() == IRI::make("https://www.example.com/author")); + CHECK(it->value().object() == IRI::make("https://www.example2.com")); + ++it; + CHECK(it != std::default_sentinel); + CHECK(!it->has_value()); + CHECK(it->error().error_type == ParsingError::Type::BadIri); + CHECK(it->error().message == "htt?ps://example: InvalidScheme"); + ++it; + CHECK(it != std::default_sentinel); + CHECK(it->has_value()); + CHECK(it->value().subject() == IRI::make("https://www.example.com")); + CHECK(it->value().predicate() == IRI::make("https://www.example.com/released")); + CHECK(it->value().object() == Literal::make_typed_from_value(true)); + ++it; + CHECK(it != std::default_sentinel); + CHECK(it->has_value()); + CHECK(it->value().subject() == IRI::make("https://www.example.com")); + CHECK(it->value().predicate() == IRI::make("https://www.example.com/recommended")); + CHECK(it->value().object() == IRI::make("https://www.other_example.com")); + ++it; + CHECK(it != std::default_sentinel); + CHECK(it->has_value()); + CHECK(it->value().subject() == IRI::make("https://www.other_example.com")); + CHECK(it->value().predicate() == IRI::make("https://www.example.com/title")); + CHECK(it->value().object() == Literal::make_simple("other example")); + ++it; + CHECK(it != std::default_sentinel); + CHECK(it->has_value()); + 
CHECK(it->value().subject() == IRI::make("https://www.example.com")); + CHECK(it->value().predicate() == IRI::make("https://www.example.com/recommended")); + CHECK(it->value().object().is_blank_node()); + auto bn = it->value().object(); + ++it; + CHECK(it != std::default_sentinel); + CHECK(it->has_value()); + CHECK(it->value().subject() == bn); + CHECK(it->value().predicate() == IRI::make("https://www.example.com/title")); + CHECK(it->value().object() == Literal::make_simple("blank example")); + ++it; + CHECK(it != std::default_sentinel); + CHECK(it->has_value()); + CHECK(it->value().subject() == IRI::make("https://www.example.com")); + CHECK(it->value().predicate() == IRI::make("https://www.example.com/recommended")); + CHECK(it->value().object().is_blank_node()); + CHECK(it->value().object() != bn); + auto bn2 = it->value().object(); + ++it; + CHECK(it != std::default_sentinel); + CHECK(it->has_value()); + CHECK(it->value().subject() == bn2); + CHECK(it->value().predicate() == IRI::make("https://www.example.com/title")); + CHECK(it->value().object() == Literal::make_simple("blank example 2")); + ++it; + CHECK(it == std::default_sentinel); +} From 2c92962c7d1974a8630fe3188a2ef7b4e14fa10d Mon Sep 17 00:00:00 2001 From: mcb Date: Thu, 6 Nov 2025 17:27:50 +0100 Subject: [PATCH 02/42] api, remove uneccesary graph --- src/rdf4cpp/parser/XMLParser.cpp | 54 ++++++++++++++++++++------------ src/rdf4cpp/parser/XMLParser.hpp | 30 ++++++++++++++++++ 2 files changed, 64 insertions(+), 20 deletions(-) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index dba409e7..25261472 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -8,7 +8,10 @@ namespace rdf4cpp::parser { private: xmlSAXHandler handler_; std::unique_ptr context_; - std::istream &stream_; // NOLINT(*-avoid-const-or-ref-data-members) + void* reader_obj_; + ReadFunc read_func_; + ErrorFunc error_func_; + EOFFunc eof_func_; std::deque result_queue_; 
size_t next_bn_index_ = 0; @@ -114,9 +117,8 @@ namespace rdf4cpp::parser { * @param subject * @param predicate * @param object - * @param graph */ - void add_statement(Node subject, IRI predicate, Node object, std::optional graph = std::nullopt); + void add_statement(Node subject, IRI predicate, Node object); void update_current_state(); void pop_state(Node object); static std::string_view trim(std::string_view v); @@ -128,7 +130,7 @@ namespace rdf4cpp::parser { static void on_error(void *th, char const *msg, ...); public: - explicit Impl(std::istream &stream); + explicit Impl(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof); std::optional next(); }; @@ -165,18 +167,11 @@ namespace rdf4cpp::parser { uint64_t const col = xmlSAX2GetColumnNumber(context_.get()); result_queue_.emplace_back(nonstd::unexpect, ty, lin, col, std::move(msg)); } - void XMLQuadIterator::Impl::add_statement(Node subject, IRI predicate, Node object, std::optional graph) { + void XMLQuadIterator::Impl::add_statement(Node subject, IRI predicate, Node object) { if (subject.null() || predicate.null() || object.null()) { return; } - if (graph.has_value()) { - if (graph->null()) { - return; - } - result_queue_.emplace_back(Quad(*graph, subject, predicate, object)); - } else { - result_queue_.emplace_back(Quad(subject, predicate, object)); - } + result_queue_.emplace_back(Quad(subject, predicate, object)); } void XMLQuadIterator::Impl::update_current_state() { if (state_stack_.empty()) { @@ -387,19 +382,19 @@ namespace rdf4cpp::parser { impl->pop_state(Node::make_null()); } - XMLQuadIterator::Impl::Impl(std::istream &stream) + XMLQuadIterator::Impl::Impl(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof) : handler_(make_sax_handler()), context_(xmlCreatePushParserCtxt(&handler_, this, nullptr, 0, "mem")), - stream_(stream) { + reader_obj_(obj), read_func_(read), error_func_(err), eof_func_(eof) { state_stack_.emplace_back(std::in_place_type_t{}); update_current_state(); } std::optional 
XMLQuadIterator::Impl::next() { - char buffer[1024]; - while (result_queue_.empty() && stream_.good() && !stream_.eof()) { - stream_.read(static_cast(buffer), sizeof(buffer)); - xmlParseChunk(context_.get(), static_cast(buffer), static_cast(stream_.gcount()), stream_.eof()); + std::array buffer; + while (result_queue_.empty() && error_func_(reader_obj_) == 0 && eof_func_(reader_obj_) == 0) { + auto const read = read_func_(buffer.data(), sizeof(char), buffer.size(), reader_obj_); + xmlParseChunk(context_.get(), buffer.data(), read, eof_func_(reader_obj_) != 0); } if (result_queue_.empty()) { return std::nullopt; @@ -410,8 +405,27 @@ namespace rdf4cpp::parser { } + XMLQuadIterator::XMLQuadIterator(void *stream, ReadFunc read, ErrorFunc error, EOFFunc eof) + : impl_(std::make_unique(stream, read, error, eof)), cur_(impl_->next()) { + } XMLQuadIterator::XMLQuadIterator(std::istream &stream) - : impl_(std::make_unique(stream)), cur_(impl_->next()) { + : XMLQuadIterator(&stream, + [](void *buf, [[maybe_unused]] size_t elem_size, size_t count, void *voided_self) noexcept -> size_t { + RDF4CPP_ASSERT(elem_size == 1); + + auto *self = static_cast(voided_self); + self->read(static_cast(buf), static_cast(count)); + return self->gcount(); + }, + [](void *voided_self) noexcept { + auto *self = static_cast(voided_self); + return static_cast(self->fail() && !self->eof()); + }, + [](void *voided_self) noexcept { + auto *self = static_cast(voided_self); + return static_cast(self->eof()); + }) + { } XMLQuadIterator::~XMLQuadIterator() noexcept = default; diff --git a/src/rdf4cpp/parser/XMLParser.hpp b/src/rdf4cpp/parser/XMLParser.hpp index a1e8919d..b9865ec8 100644 --- a/src/rdf4cpp/parser/XMLParser.hpp +++ b/src/rdf4cpp/parser/XMLParser.hpp @@ -27,6 +27,35 @@ namespace rdf4cpp::parser { using iterator_category = std::input_iterator_tag; using istream_type = std::istream; + /** + * Identical semantics to fread. 
+ * Uses stream to read at most count elements of size element_size into buffer. + * + * @param buffer pointer to buffer with at least count elements of size elem_size + * @param elem_size sizeof each element + * @param count number of elements to read + * @param stream pointer to any object. + * @return number of elements read + */ + using ReadFunc = size_t (*)(void *buffer, size_t elem_size, size_t count, void *stream); + + /** + * Identical semantics to ferror. + * + * @param stream pointer to any object + * @return nonzero value if there is an error in stream, zero value otherwise + */ + using ErrorFunc = int (*)(void *stream); + + /** + * Identical semantics to feof. + * + * + * @param stream pointer to any object + * @return nonzero value if there is an error in stream, zero value otherwise + */ + using EOFFunc = int (*)(void *stream); + private: struct Impl; @@ -34,6 +63,7 @@ namespace rdf4cpp::parser { std::optional> cur_; public: + XMLQuadIterator(void *stream, ReadFunc read, ErrorFunc error, EOFFunc eof); explicit XMLQuadIterator(std::istream& stream); XMLQuadIterator(XMLQuadIterator&&) noexcept = delete; From 105e65d83901560fb18eb1ce9704bf866bf293d1 Mon Sep 17 00:00:00 2001 From: mcb Date: Fri, 7 Nov 2025 14:43:08 +0100 Subject: [PATCH 03/42] fix sanitizer --- src/rdf4cpp/parser/XMLParser.cpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index 25261472..12e4825b 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -65,7 +65,8 @@ namespace rdf4cpp::parser { : subject(sub) { } - static Node try_enter(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes); + template + static bool try_enter(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes, F f); static constexpr std::string_view start_element = 
"http://www.w3.org/1999/02/22-rdf-syntax-ns#Description"; static constexpr std::string_view about_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about"; @@ -257,7 +258,7 @@ namespace rdf4cpp::parser { } } void XMLQuadIterator::Impl::RDFState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { - if (!DescriptionState::try_enter(impl, local_name, uri, attributes).null()) { + if (DescriptionState::try_enter(impl, local_name, uri, attributes, [](auto) {})) { return; } impl->add_error(ParsingError::Type::BadSyntax, "expected Description, found ???"); @@ -297,22 +298,25 @@ namespace rdf4cpp::parser { void XMLQuadIterator::Impl::DescriptionState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { impl->pop_state(subject); } - Node XMLQuadIterator::Impl::DescriptionState::try_enter(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { + template + bool XMLQuadIterator::Impl::DescriptionState::try_enter(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { if (iri_equal_pieces(start_element, uri, local_name)) { for (auto const &att : attributes) { if (iri_equal_pieces(about_attrib, att.uri(), att.local_name())) { IRI iri = impl->try_make_iri(att.value()); + f(iri); impl->state_stack_.emplace_back(std::in_place_type_t{}, iri); impl->update_current_state(); - return iri; + return true; } } auto bn = impl->make_bn(); + f(bn); impl->state_stack_.emplace_back(std::in_place_type_t{}, bn); impl->update_current_state(); - return bn; + return true; } - return Node::make_null(); + return false; } void XMLQuadIterator::Impl::PredicateState::on_characters([[maybe_unused]] Impl *impl, std::string_view const chars) { @@ -333,10 +337,10 @@ namespace rdf4cpp::parser { impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found 
element"); return; } - auto const obj = DescriptionState::try_enter(impl, local_name, uri, attributes); - if (!obj.null()) { - std::get(impl->state_stack_[impl->state_stack_.size()-2]).done = true; + if (DescriptionState::try_enter(impl, local_name, uri, attributes, [&](Node obj) { + done = true; impl->add_statement(subject, predicate, obj); + })) { return; } impl->add_error(ParsingError::Type::BadSyntax, "expected Description or literal, found element"); From 24b3c0376994a5380cc70e41c33e040e82b90fac Mon Sep 17 00:00:00 2001 From: mcb Date: Fri, 7 Nov 2025 15:11:08 +0100 Subject: [PATCH 04/42] cleanup --- src/rdf4cpp/parser/XMLParser.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index 12e4825b..6974437e 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -395,10 +395,10 @@ namespace rdf4cpp::parser { } std::optional XMLQuadIterator::Impl::next() { - std::array buffer; + std::array buffer; // NOLINT(*-pro-type-member-init) while (result_queue_.empty() && error_func_(reader_obj_) == 0 && eof_func_(reader_obj_) == 0) { auto const read = read_func_(buffer.data(), sizeof(char), buffer.size(), reader_obj_); - xmlParseChunk(context_.get(), buffer.data(), read, eof_func_(reader_obj_) != 0); + xmlParseChunk(context_.get(), buffer.data(), static_cast(read), eof_func_(reader_obj_) != 0); } if (result_queue_.empty()) { return std::nullopt; From 4c1b52e1da5bba353a0ffdc094781373bb0f0e20 Mon Sep 17 00:00:00 2001 From: mcb Date: Wed, 12 Nov 2025 13:42:37 +0100 Subject: [PATCH 05/42] entities --- src/rdf4cpp/parser/XMLParser.cpp | 4 +++ tests/parser/tests_XMLParser.cpp | 48 ++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index 6974437e..667dbb33 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -140,6 +140,9 @@ 
namespace rdf4cpp::parser { xmlSAXHandler r{}; std::memset(&r, 0, sizeof(xmlSAXHandler)); r.initialized = XML_SAX2_MAGIC; + r.getParameterEntity = [](void *, xmlChar const *e) { + return xmlGetPredefinedEntity(e); + }; r.getEntity = [](void *, xmlChar const *e) { return xmlGetPredefinedEntity(e); }; @@ -390,6 +393,7 @@ namespace rdf4cpp::parser { : handler_(make_sax_handler()), context_(xmlCreatePushParserCtxt(&handler_, this, nullptr, 0, "mem")), reader_obj_(obj), read_func_(read), error_func_(err), eof_func_(eof) { + xmlCtxtSetOptions(context_.get(), XML_PARSE_NOENT | XML_PARSE_PEDANTIC | XML_PARSE_NOCDATA | XML_PARSE_NO_XXE | XML_PARSE_BIG_LINES); state_stack_.emplace_back(std::in_place_type_t{}); update_current_state(); } diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index 0e427698..84644595 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -116,3 +116,51 @@ TEST_CASE("sanity test") { ++it; CHECK(it == std::default_sentinel); } + +TEST_CASE("rdf") { + // adapted from https://github.com/w3c/rdf-tests/tree/main/rdf/rdf11/rdf-xml + + std::string xml = ""; + std::string nt = ""; + SUBCASE("amp") { + xml = R"( + + + + xxx + + + xxx + + +)"; + nt = R"( "xxx" . 
+ "xxx" .)"; + } + + if (xml.empty()) { + return; + } + + std::stringstream xml_str{xml}; + XMLQuadIterator xml_iter{xml_str}; + + std::stringstream nt_str{nt}; + IStreamQuadIterator nt_iter{nt_str, ParsingFlag::NTriples}; + + while (nt_iter != std::default_sentinel) { + REQUIRE(xml_iter != std::default_sentinel); + if (!xml_iter->has_value()) { + FAIL(xml_iter->error().message); + } + REQUIRE(nt_iter->has_value()); + CHECK(xml_iter->value().subject() == nt_iter->value().subject()); + CHECK(xml_iter->value().predicate() == nt_iter->value().predicate()); + CHECK(xml_iter->value().object() == nt_iter->value().object()); + + ++xml_iter; + ++nt_iter; + } + + REQUIRE(xml_iter == std::default_sentinel); +} From e518b99c2b79b9ffb81f21c5d019d13ecd7f6084 Mon Sep 17 00:00:00 2001 From: mcb Date: Wed, 12 Nov 2025 15:44:13 +0100 Subject: [PATCH 06/42] more syntax+tests --- src/rdf4cpp/parser/XMLParser.cpp | 94 +++++++++++++++++++++++++++----- tests/parser/tests_XMLParser.cpp | 51 +++++++++++++++++ 2 files changed, 130 insertions(+), 15 deletions(-) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index 667dbb33..3ca27d68 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -14,6 +14,7 @@ namespace rdf4cpp::parser { EOFFunc eof_func_; std::deque result_queue_; size_t next_bn_index_ = 0; + IRIFactory factory_; struct Attribute { xmlChar const *local_name_raw; @@ -46,6 +47,8 @@ namespace rdf4cpp::parser { void on_characters(Impl *impl, std::string_view chars) override; void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + + static constexpr std::string_view base_attribute = "http://www.w3.org/XML/1998/namespacebase"; }; struct RDFState final : BaseState { void on_characters(Impl *impl, std::string_view chars) override; @@ -70,6 +73,9 @@ namespace 
rdf4cpp::parser { static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description"; static constexpr std::string_view about_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about"; + static constexpr std::string_view id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#ID"; + static constexpr std::string_view node_id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID"; + static constexpr std::string_view type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; }; struct PredicateState : BaseState { void on_characters(Impl *impl, std::string_view chars) override; @@ -205,7 +211,7 @@ namespace rdf4cpp::parser { return full_iri.starts_with(uri) && full_iri.ends_with(local_name); } IRI XMLQuadIterator::Impl::try_make_iri(std::string_view const iri) { - auto exp = IRIFactory::create_and_validate(iri); + auto exp = factory_.from_maybe_relative(iri); if (exp.has_value()) { return *exp; } else { @@ -245,6 +251,14 @@ namespace rdf4cpp::parser { } void XMLQuadIterator::Impl::EmptyState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes) { if (iri_equal_pieces(RDFState::start_element, uri, local_name)) { + for (const auto& a : attributes) { + if (iri_equal_pieces(base_attribute, a.uri(), a.local_name())) + { + if (auto r = impl->factory_.set_base(a.value()); r != IRIFactoryError::Ok) { + impl->add_error(ParsingError::Type::BadIri, std::format("invalid IRI ({}): {}", r, a.value())); + } + } + } impl->state_stack_.emplace_back(std::in_place_type_t{}); impl->update_current_state(); return; @@ -303,23 +317,73 @@ namespace rdf4cpp::parser { } template bool XMLQuadIterator::Impl::DescriptionState::try_enter(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { - if (iri_equal_pieces(start_element, uri, local_name)) { - for (auto const &att : attributes) { - if 
(iri_equal_pieces(about_attrib, att.uri(), att.local_name())) { - IRI iri = impl->try_make_iri(att.value()); - f(iri); - impl->state_stack_.emplace_back(std::in_place_type_t{}, iri); - impl->update_current_state(); - return true; + + Node sub = Node::make_null(); + auto check_only_one = [&sub, impl]() { + if (!sub.null()) { + impl->add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:ID, rdf:about, and rdf:nodeID"); + return true; + } + return false; + }; + for (auto const &att : attributes) { + if (iri_equal_pieces(about_attrib, att.uri(), att.local_name())) { + if (check_only_one()) { + continue; + ; + } + sub = impl->try_make_iri(att.value()); + } else if (iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { + if (check_only_one()) { + continue; + ; } + std::string i = "#"; + i.append(att.value()); + sub = impl->try_make_iri(i); + } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { // TODO test case + if (check_only_one()) { + continue; + ; + } + sub = BlankNode::make(att.value()); } - auto bn = impl->make_bn(); - f(bn); - impl->state_stack_.emplace_back(std::in_place_type_t{}, bn); - impl->update_current_state(); - return true; } - return false; + if (sub.null()) + { + sub = impl->make_bn(); + } + if (!iri_equal_pieces(start_element, uri, local_name)) { + IRI const obj = impl->try_make_iri(uri, local_name); + if (!obj.null()) + { + impl->add_statement(sub, IRI::rdf_type(), obj); + } + } + for (auto const &att : attributes) { + if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { // TODO needs test + IRI const obj = impl->try_make_iri(att.value()); + if (obj.null()) + { + continue; + } + impl->add_statement(sub, IRI::rdf_type(), obj); + } else if (iri_equal_pieces(about_attrib, att.uri(), att.local_name()) || iri_equal_pieces(node_id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { + continue; + } else { // TODO tests say this is correct, spec does not??? 
+ IRI const pred = impl->try_make_iri(att.uri(), att.local_name()); + Literal const obj = Literal::make_simple(att.value()); + if (pred.null() || obj.null()) + { + continue; + } + impl->add_statement(sub, pred, obj); + } + } + f(sub); + impl->state_stack_.emplace_back(std::in_place_type_t{}, sub); + impl->update_current_state(); + return true; } void XMLQuadIterator::Impl::PredicateState::on_characters([[maybe_unused]] Impl *impl, std::string_view const chars) { diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index 84644595..9b9ce8c2 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -122,6 +122,43 @@ TEST_CASE("rdf") { std::string xml = ""; std::string nt = ""; + + + SUBCASE("syntax 1 (base applies to id)") { + xml = R"( + + + + +)"; + nt = R"( "v" .)"; + } + SUBCASE("syntax 2 (base applies to resource)") { + xml = R"( + + + + + + +)"; + nt = R"( .)"; + } + SUBCASE("syntax 3 (base applies to about)") { + xml = R"( + + + + +)"; + nt = R"( .)"; + } SUBCASE("amp") { xml = R"( @@ -137,6 +174,20 @@ TEST_CASE("rdf") { nt = R"( "xxx" . "xxx" .)"; } + SUBCASE("datatypes") { + xml = R"( + + + + 10 + 10 + + +)"; + nt = R"( "10"^^ . 
+ "10"^^ .)"; + } if (xml.empty()) { return; From e1124293098116ac07310afce55ee835b44b0bf6 Mon Sep 17 00:00:00 2001 From: mcb Date: Wed, 12 Nov 2025 16:20:41 +0100 Subject: [PATCH 07/42] base scoping --- src/rdf4cpp/IRIFactory.cpp | 4 ++ src/rdf4cpp/IRIFactory.hpp | 6 +++ src/rdf4cpp/parser/XMLParser.cpp | 83 ++++++++++++++++++++++---------- tests/parser/tests_XMLParser.cpp | 13 +++++ 4 files changed, 81 insertions(+), 25 deletions(-) diff --git a/src/rdf4cpp/IRIFactory.cpp b/src/rdf4cpp/IRIFactory.cpp index 52fdd444..4e1e8687 100644 --- a/src/rdf4cpp/IRIFactory.cpp +++ b/src/rdf4cpp/IRIFactory.cpp @@ -292,5 +292,9 @@ IRIFactoryError IRIFactory::set_base(std::string_view b) noexcept { base_parts_cache = IRIView{base}.all_parts(); return IRIFactoryError::Ok; } +void IRIFactory::set_base_unchecked(std::string_view b) noexcept { + base = b; + base_parts_cache = IRIView{base}.all_parts(); +} } // namespace rdf4cpp diff --git a/src/rdf4cpp/IRIFactory.hpp b/src/rdf4cpp/IRIFactory.hpp index 4ba20526..d2549f7d 100644 --- a/src/rdf4cpp/IRIFactory.hpp +++ b/src/rdf4cpp/IRIFactory.hpp @@ -105,6 +105,12 @@ struct IRIFactory { * @return */ [[nodiscard]] IRIFactoryError set_base(std::string_view b) noexcept; + /** + * Changes the base IRI. Skips validating the new base IRI. + * @param b + * @return + */ + void set_base_unchecked(std::string_view b) noexcept; /** * validates the given IRI and creates it in the given node storage, if valid. diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index 3ca27d68..4c0e5836 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -2,6 +2,7 @@ #include #include +#include namespace rdf4cpp::parser { struct XMLQuadIterator::Impl { @@ -41,14 +42,17 @@ namespace rdf4cpp::parser { virtual void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) = 0; // TODO remove params? 
virtual void re_enter([[maybe_unused]] Impl *impl, [[maybe_unused]] Node obj) { // TODO remove if not needed for something } + + std::string base; + + static constexpr std::string_view base_attribute = "http://www.w3.org/XML/1998/namespacebase"; + static std::string_view try_handle_base_attrib(Impl* impl, std::span attributes); }; struct EmptyState final : BaseState { void on_characters(Impl *impl, std::string_view chars) override; void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; - - static constexpr std::string_view base_attribute = "http://www.w3.org/XML/1998/namespacebase"; }; struct RDFState final : BaseState { void on_characters(Impl *impl, std::string_view chars) override; @@ -130,8 +134,8 @@ namespace rdf4cpp::parser { void pop_state(Node object); static std::string_view trim(std::string_view v); static bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); - IRI try_make_iri(std::string_view iri); - IRI try_make_iri(std::string_view uri, std::string_view local_name); + IRI try_make_iri(std::string_view iri, std::string_view base); + IRI try_make_iri(std::string_view uri, std::string_view local_name, std::string_view base); BlankNode make_bn(); static void on_error(void *th, char const *msg, ...); @@ -186,6 +190,7 @@ namespace rdf4cpp::parser { void XMLQuadIterator::Impl::update_current_state() { if (state_stack_.empty()) { current_state_ = nullptr; + return; } current_state_ = std::visit([](auto &s) -> BaseState * { return &s; }, state_stack_.back()); } @@ -210,7 +215,19 @@ namespace rdf4cpp::parser { } return full_iri.starts_with(uri) && full_iri.ends_with(local_name); } - IRI XMLQuadIterator::Impl::try_make_iri(std::string_view const iri) { + IRI XMLQuadIterator::Impl::try_make_iri(std::string_view const iri, std::string_view base) { + if 
(base.empty()) { + for (const auto &s : state_stack_ | std::ranges::views::reverse) { + auto const v = std::visit([](const auto& s) -> std::string_view { return s.base; }, s); + if (!v.empty()) { + factory_.set_base_unchecked(v); + break; + } + } + } + else { + factory_.set_base_unchecked(base); + } auto exp = factory_.from_maybe_relative(iri); if (exp.has_value()) { return *exp; @@ -219,10 +236,10 @@ namespace rdf4cpp::parser { return IRI::make_null(); } } - IRI XMLQuadIterator::Impl::try_make_iri(std::string_view const uri, std::string_view const local_name) { + IRI XMLQuadIterator::Impl::try_make_iri(std::string_view const uri, std::string_view const local_name, std::string_view base) { std::string iri{uri}; iri.append(local_name); - return try_make_iri(iri); + return try_make_iri(iri, base); } BlankNode XMLQuadIterator::Impl::make_bn() { return BlankNode::make_unchecked(std::format("bn_{}", next_bn_index_++)); @@ -244,6 +261,18 @@ namespace rdf4cpp::parser { va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay) } + std::string_view XMLQuadIterator::Impl::BaseState::try_handle_base_attrib(Impl* impl, std::span attributes) { + for (const auto& a : attributes) { + if (iri_equal_pieces(base_attribute, a.uri(), a.local_name())) + { + if (auto r = IRIView(a.value()).quick_validate(); r != IRIFactoryError::Ok) { + impl->add_error(ParsingError::Type::BadIri, std::format("invalid base IRI ({}): {}", r, a.value())); + } + return a.value(); + } + } + return ""; + } void XMLQuadIterator::Impl::EmptyState::on_characters(Impl *impl, std::string_view const chars) { if (!trim(chars).empty()) { impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found characters"); @@ -251,16 +280,12 @@ namespace rdf4cpp::parser { } void XMLQuadIterator::Impl::EmptyState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes) { if (iri_equal_pieces(RDFState::start_element, uri, local_name)) { - for 
(const auto& a : attributes) { - if (iri_equal_pieces(base_attribute, a.uri(), a.local_name())) - { - if (auto r = impl->factory_.set_base(a.value()); r != IRIFactoryError::Ok) { - impl->add_error(ParsingError::Type::BadIri, std::format("invalid IRI ({}): {}", r, a.value())); - } - } - } + auto const base = try_handle_base_attrib(impl, attributes); impl->state_stack_.emplace_back(std::in_place_type_t{}); impl->update_current_state(); + if (!base.empty()) { + impl->current_state_->base = base; + } return; } impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found ???"); @@ -290,16 +315,17 @@ namespace rdf4cpp::parser { } } void XMLQuadIterator::Impl::DescriptionState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span attributes) { + auto base = try_handle_base_attrib(impl, attributes); std::string s{uri}; s.append(local_name); - auto predicate = impl->try_make_iri(s); + auto predicate = impl->try_make_iri(s, base); std::optional datatype = std::nullopt; std::optional ref = std::nullopt; for (auto const &att : attributes) { if (iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name())) { - datatype = impl->try_make_iri(att.value()); + datatype = impl->try_make_iri(att.value(), base); } else if (iri_equal_pieces(PredicateState::resource_attrib, att.uri(), att.local_name())) { - ref = impl->try_make_iri(att.value()); + ref = impl->try_make_iri(att.value(), base); } } if (datatype.has_value()) { @@ -311,13 +337,16 @@ namespace rdf4cpp::parser { impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate); } impl->update_current_state(); + if (!base.empty()) { + impl->current_state_->base = base; + } } void XMLQuadIterator::Impl::DescriptionState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { impl->pop_state(subject); } template bool XMLQuadIterator::Impl::DescriptionState::try_enter(Impl *impl, 
std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { - + auto base = try_handle_base_attrib(impl, attributes); Node sub = Node::make_null(); auto check_only_one = [&sub, impl]() { if (!sub.null()) { @@ -332,7 +361,7 @@ namespace rdf4cpp::parser { continue; ; } - sub = impl->try_make_iri(att.value()); + sub = impl->try_make_iri(att.value(), base); } else if (iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { if (check_only_one()) { continue; @@ -340,7 +369,7 @@ namespace rdf4cpp::parser { } std::string i = "#"; i.append(att.value()); - sub = impl->try_make_iri(i); + sub = impl->try_make_iri(i, base); } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { // TODO test case if (check_only_one()) { continue; @@ -354,7 +383,7 @@ namespace rdf4cpp::parser { sub = impl->make_bn(); } if (!iri_equal_pieces(start_element, uri, local_name)) { - IRI const obj = impl->try_make_iri(uri, local_name); + IRI const obj = impl->try_make_iri(uri, local_name, base); if (!obj.null()) { impl->add_statement(sub, IRI::rdf_type(), obj); @@ -362,16 +391,16 @@ namespace rdf4cpp::parser { } for (auto const &att : attributes) { if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { // TODO needs test - IRI const obj = impl->try_make_iri(att.value()); + IRI const obj = impl->try_make_iri(att.value(), base); if (obj.null()) { continue; } impl->add_statement(sub, IRI::rdf_type(), obj); - } else if (iri_equal_pieces(about_attrib, att.uri(), att.local_name()) || iri_equal_pieces(node_id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { + } else if (iri_equal_pieces(about_attrib, att.uri(), att.local_name()) || iri_equal_pieces(node_id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(base_attribute, att.uri(), att.local_name())) { continue; } else { // TODO tests say this is correct, spec does not??? 
- IRI const pred = impl->try_make_iri(att.uri(), att.local_name()); + IRI const pred = impl->try_make_iri(att.uri(), att.local_name(), base); Literal const obj = Literal::make_simple(att.value()); if (pred.null() || obj.null()) { @@ -383,6 +412,9 @@ namespace rdf4cpp::parser { f(sub); impl->state_stack_.emplace_back(std::in_place_type_t{}, sub); impl->update_current_state(); + if (!base.empty()) { + impl->current_state_->base = base; + } return true; } @@ -460,6 +492,7 @@ namespace rdf4cpp::parser { xmlCtxtSetOptions(context_.get(), XML_PARSE_NOENT | XML_PARSE_PEDANTIC | XML_PARSE_NOCDATA | XML_PARSE_NO_XXE | XML_PARSE_BIG_LINES); state_stack_.emplace_back(std::in_place_type_t{}); update_current_state(); + current_state_->base = IRIFactory::default_base; } std::optional XMLQuadIterator::Impl::next() { diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index 9b9ce8c2..2a2d8abb 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -159,6 +159,19 @@ TEST_CASE("rdf") { )"; nt = R"( .)"; } + SUBCASE("syntax 4 (base scoping)") { + xml = R"( + + + + + +)"; + nt = R"( "v" . + .)"; + } SUBCASE("amp") { xml = R"( From d040b2e8bcf352e2af47f30904f0ae5693700dcb Mon Sep 17 00:00:00 2001 From: mcb Date: Wed, 12 Nov 2025 16:43:04 +0100 Subject: [PATCH 08/42] more tests --- tests/parser/tests_XMLParser.cpp | 95 +++++++++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 1 deletion(-) diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index 2a2d8abb..c7588e18 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -159,7 +159,8 @@ TEST_CASE("rdf") { )"; nt = R"( .)"; } - SUBCASE("syntax 4 (base scoping)") { + // case 4 needs reification + SUBCASE("syntax 6 (base scoping)") { xml = R"( )"; nt = R"( "v" . 
.)"; + } + SUBCASE("syntax 7 (relative resolution)") { + xml = R"( + + + + + +)"; + nt = R"( .)"; + } + SUBCASE("syntax 8 (empty local)") { + xml = R"( + + + + + +)"; + nt = R"( .)"; + } + SUBCASE("syntax 9 (absolute path)") { + xml = R"( + + + + + +)"; + nt = R"( .)"; + } + SUBCASE("syntax 10 (absolute host)") { + xml = R"( + + + + + +)"; + nt = R"( .)"; + } + SUBCASE("syntax 11 (base without path)") { + xml = R"( + + + + + +)"; + nt = R"( .)"; + } + SUBCASE("syntax 13 (base with fragment)") { + xml = R"( + + + + + + + + +)"; + nt = R"( . + .)"; + } + SUBCASE("syntax 14 (same ids)") { + xml = R"( + + + + + + +)"; + nt = R"( "v" . + "v" .)"; } SUBCASE("amp") { xml = R"( From cfd76647a410f2a763cbb4787a635e3bbbe263e0 Mon Sep 17 00:00:00 2001 From: mcb Date: Thu, 13 Nov 2025 13:36:29 +0100 Subject: [PATCH 09/42] parsing state --- src/rdf4cpp/parser/XMLParser.cpp | 114 ++++++++++++++++++++++--------- src/rdf4cpp/parser/XMLParser.hpp | 4 +- 2 files changed, 83 insertions(+), 35 deletions(-) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index 4c0e5836..4e84ef7c 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -15,7 +15,8 @@ namespace rdf4cpp::parser { EOFFunc eof_func_; std::deque result_queue_; size_t next_bn_index_ = 0; - IRIFactory factory_; + std::unique_ptr owned_state_; + state_type *state_; struct Attribute { xmlChar const *local_name_raw; @@ -134,14 +135,17 @@ namespace rdf4cpp::parser { void pop_state(Node object); static std::string_view trim(std::string_view v); static bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); + template + NT inspect_node(NT node); IRI try_make_iri(std::string_view iri, std::string_view base); IRI try_make_iri(std::string_view uri, std::string_view local_name, std::string_view base); - BlankNode make_bn(); + Node make_bn(std::optional name); + Literal make_literal(std::string_view value, std::optional datatype, 
std::optional lang_tag); static void on_error(void *th, char const *msg, ...); public: - explicit Impl(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof); + explicit Impl(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type* state); std::optional next(); }; @@ -215,22 +219,36 @@ namespace rdf4cpp::parser { } return full_iri.starts_with(uri) && full_iri.ends_with(local_name); } + template + NT XMLQuadIterator::Impl::inspect_node(NT node) { + try { + state_->inspect_node_func(node); + return node; + } + catch (std::exception &e) { + add_error(ParsingError::Type::BadSyntax, std::format("Triple explicitly skipped by inspect function: {}", e.what())); + } + catch (...) { + add_error(ParsingError::Type::BadSyntax, "Triple explicitly skipped by inspect function"); + } + return NT::make_null(); + } IRI XMLQuadIterator::Impl::try_make_iri(std::string_view const iri, std::string_view base) { if (base.empty()) { for (const auto &s : state_stack_ | std::ranges::views::reverse) { auto const v = std::visit([](const auto& s) -> std::string_view { return s.base; }, s); if (!v.empty()) { - factory_.set_base_unchecked(v); + state_->iri_factory.set_base_unchecked(v); break; } } } else { - factory_.set_base_unchecked(base); + state_->iri_factory.set_base_unchecked(base); } - auto exp = factory_.from_maybe_relative(iri); + auto exp = state_->iri_factory.from_maybe_relative(iri, state_->node_storage); if (exp.has_value()) { - return *exp; + return inspect_node(*exp); } else { add_error(ParsingError::Type::BadIri, std::format("{}: {}", iri, exp.error())); return IRI::make_null(); @@ -241,8 +259,50 @@ namespace rdf4cpp::parser { iri.append(local_name); return try_make_iri(iri, base); } - BlankNode XMLQuadIterator::Impl::make_bn() { - return BlankNode::make_unchecked(std::format("bn_{}", next_bn_index_++)); + Node XMLQuadIterator::Impl::make_bn(std::optional name) { + std::string n = ""; + if (!name.has_value()) { + n = std::format("bn_{}", next_bn_index_++); + name = n; + } 
+ try { + if (state_->blank_node_scope_manager == nullptr) + { + return inspect_node(BlankNode::make(*name)); + } + else { + return inspect_node(state_->blank_node_scope_manager.scope("").get_or_generate_node(*name, state_->node_storage)); + } + } + catch (InvalidNode const &e) { + add_error(ParsingError::Type::BadBlankNode, e.what()); + return BlankNode::make_null(); + } + catch (...) { + add_error(ParsingError::Type::BadBlankNode, "unknown error"); + return BlankNode::make_null(); + } + } + Literal XMLQuadIterator::Impl::make_literal(std::string_view value, std::optional datatype, std::optional lang_tag) { + Literal l = Literal::make_null(); + try { + if (datatype.has_value()) { + l = Literal::make_typed(value, *datatype, state_->node_storage); + } + else if (lang_tag.has_value()) { + l = Literal::make_lang_tagged(value, *lang_tag, state_->node_storage); + } + else { + l = Literal::make_simple(value); + } + } + catch (InvalidNode const &e) { + add_error(ParsingError::Type::BadLiteral, e.what()); + } + catch (...) { + add_error(ParsingError::Type::BadLiteral, "unknown error"); + } + return inspect_node(l); } void XMLQuadIterator::Impl::on_error(void *th, char const *msg, ...) 
{ // NOLINT(*-dcl50-cpp) va_list args; @@ -359,13 +419,11 @@ namespace rdf4cpp::parser { if (iri_equal_pieces(about_attrib, att.uri(), att.local_name())) { if (check_only_one()) { continue; - ; } sub = impl->try_make_iri(att.value(), base); } else if (iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { if (check_only_one()) { continue; - ; } std::string i = "#"; i.append(att.value()); @@ -373,14 +431,13 @@ namespace rdf4cpp::parser { } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { // TODO test case if (check_only_one()) { continue; - ; } - sub = BlankNode::make(att.value()); + sub = impl->make_bn(att.value()); } } if (sub.null()) { - sub = impl->make_bn(); + sub = impl->make_bn(std::nullopt); } if (!iri_equal_pieces(start_element, uri, local_name)) { IRI const obj = impl->try_make_iri(uri, local_name, base); @@ -401,7 +458,7 @@ namespace rdf4cpp::parser { continue; } else { // TODO tests say this is correct, spec does not??? IRI const pred = impl->try_make_iri(att.uri(), att.local_name(), base); - Literal const obj = Literal::make_simple(att.value()); + Literal const obj = impl->make_literal(att.value(), std::nullopt, std::nullopt); if (pred.null() || obj.null()) { continue; @@ -446,12 +503,7 @@ namespace rdf4cpp::parser { } void XMLQuadIterator::Impl::PredicateState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { if (!done) { - Literal lit = Literal::make_null(); - try { - lit = Literal::make_simple(literal); - } catch (std::runtime_error const &e) { // InvalidNode is subclass - impl->add_error(ParsingError::Type::BadLiteral, e.what()); - } + Literal const lit = impl->make_literal(literal, std::nullopt, std::nullopt); impl->add_statement(subject, predicate, lit); } impl->pop_state(Node::make_null()); @@ -462,12 +514,7 @@ namespace rdf4cpp::parser { } void XMLQuadIterator::Impl::TypedLiteralPredicateState::on_end_element(Impl *impl, [[maybe_unused]] 
std::string_view local_name, [[maybe_unused]] std::string_view uri) { if (!datatype.null()) { - Literal lit = Literal::make_null(); - try { - lit = Literal::make_typed(literal, datatype); - } catch (std::runtime_error const &e) { // InvalidNode is subclass - impl->add_error(ParsingError::Type::BadLiteral, e.what()); - } + Literal const lit = impl->make_literal(literal, datatype, std::nullopt); impl->add_statement(subject, predicate, lit); } impl->pop_state(Node::make_null()); @@ -485,10 +532,11 @@ namespace rdf4cpp::parser { impl->pop_state(Node::make_null()); } - XMLQuadIterator::Impl::Impl(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof) + XMLQuadIterator::Impl::Impl(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type* state) : handler_(make_sax_handler()), context_(xmlCreatePushParserCtxt(&handler_, this, nullptr, 0, "mem")), - reader_obj_(obj), read_func_(read), error_func_(err), eof_func_(eof) { + reader_obj_(obj), read_func_(read), error_func_(err), eof_func_(eof), + owned_state_(state == nullptr ? std::make_unique() : nullptr), state_(state == nullptr ? 
owned_state_.get() : state){ xmlCtxtSetOptions(context_.get(), XML_PARSE_NOENT | XML_PARSE_PEDANTIC | XML_PARSE_NOCDATA | XML_PARSE_NO_XXE | XML_PARSE_BIG_LINES); state_stack_.emplace_back(std::in_place_type_t{}); update_current_state(); @@ -510,10 +558,10 @@ namespace rdf4cpp::parser { } - XMLQuadIterator::XMLQuadIterator(void *stream, ReadFunc read, ErrorFunc error, EOFFunc eof) - : impl_(std::make_unique(stream, read, error, eof)), cur_(impl_->next()) { + XMLQuadIterator::XMLQuadIterator(void *stream, ReadFunc read, ErrorFunc error, EOFFunc eof, state_type* state) + : impl_(std::make_unique(stream, read, error, eof, state)), cur_(impl_->next()) { } - XMLQuadIterator::XMLQuadIterator(std::istream &stream) + XMLQuadIterator::XMLQuadIterator(std::istream &stream, state_type* state) : XMLQuadIterator(&stream, [](void *buf, [[maybe_unused]] size_t elem_size, size_t count, void *voided_self) noexcept -> size_t { RDF4CPP_ASSERT(elem_size == 1); @@ -529,7 +577,7 @@ namespace rdf4cpp::parser { [](void *voided_self) noexcept { auto *self = static_cast(voided_self); return static_cast(self->eof()); - }) + }, state) { } XMLQuadIterator::~XMLQuadIterator() noexcept = default; diff --git a/src/rdf4cpp/parser/XMLParser.hpp b/src/rdf4cpp/parser/XMLParser.hpp index b9865ec8..cfc0c341 100644 --- a/src/rdf4cpp/parser/XMLParser.hpp +++ b/src/rdf4cpp/parser/XMLParser.hpp @@ -63,8 +63,8 @@ namespace rdf4cpp::parser { std::optional> cur_; public: - XMLQuadIterator(void *stream, ReadFunc read, ErrorFunc error, EOFFunc eof); - explicit XMLQuadIterator(std::istream& stream); + XMLQuadIterator(void *stream, ReadFunc read, ErrorFunc error, EOFFunc eof, state_type* state = nullptr); + explicit XMLQuadIterator(std::istream& stream, state_type* state = nullptr); XMLQuadIterator(XMLQuadIterator&&) noexcept = delete; XMLQuadIterator& operator=(XMLQuadIterator&&) noexcept = delete; From db888f8913a04b485ecada5ac40f695cb1a1439b Mon Sep 17 00:00:00 2001 From: mcb Date: Thu, 13 Nov 2025 14:28:08 
+0100 Subject: [PATCH 10/42] triples in properties --- src/rdf4cpp/parser/XMLParser.cpp | 46 ++++++++++++++++++++++++++++---- tests/parser/tests_XMLParser.cpp | 40 ++++++++++++++++++++++++--- 2 files changed, 78 insertions(+), 8 deletions(-) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index 4e84ef7c..64d16955 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -47,6 +47,7 @@ namespace rdf4cpp::parser { std::string base; static constexpr std::string_view base_attribute = "http://www.w3.org/XML/1998/namespacebase"; + static constexpr std::string_view lang_attribute = "http://www.w3.org/XML/1998/namespacelang"; static std::string_view try_handle_base_attrib(Impl* impl, std::span attributes); }; @@ -97,6 +98,10 @@ namespace rdf4cpp::parser { } static constexpr std::string_view resource_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#resource"; + static constexpr std::string_view parse_type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#parseType"; + static constexpr std::string_view parse_type_resource = "Resource"; + static constexpr std::string_view parse_type_literal = "Literal"; + static constexpr std::string_view parse_type_collection = "Collection"; }; struct TypedLiteralPredicateState final : PredicateState { void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; @@ -380,19 +385,50 @@ namespace rdf4cpp::parser { s.append(local_name); auto predicate = impl->try_make_iri(s, base); std::optional datatype = std::nullopt; - std::optional ref = std::nullopt; + std::optional sub = std::nullopt; + bool parse_resource = false; for (auto const &att : attributes) { if (iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name())) { datatype = impl->try_make_iri(att.value(), base); } else if (iri_equal_pieces(PredicateState::resource_attrib, att.uri(), att.local_name())) { - ref = 
impl->try_make_iri(att.value(), base); + sub = impl->try_make_iri(att.value(), base); + } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { + sub = impl->make_bn(att.value()); + } else if (iri_equal_pieces(PredicateState::parse_type_attrib, att.uri(), att.local_name())) { + if (att.value() == PredicateState::parse_type_resource) { + parse_resource = true; + } + } + } + for (auto const &att : attributes) { + if (iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name()) || + iri_equal_pieces(PredicateState::resource_attrib, att.uri(), att.local_name()) || + iri_equal_pieces(node_id_attrib, att.uri(), att.local_name()) || + iri_equal_pieces(PredicateState::parse_type_resource, att.uri(), att.local_name()) || + iri_equal_pieces(lang_attribute, att.uri(), att.local_name())) { + continue; + } + if (!sub.has_value()) { + sub = impl->make_bn(std::nullopt); + } + if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { + IRI const obj = impl->try_make_iri(att.value(), base); + impl->add_statement(*sub, IRI::rdf_type(), obj); + } else { + IRI const pred = impl->try_make_iri(att.uri(), att.local_name(), base); + Literal const obj = impl->make_literal(att.value(), std::nullopt, std::nullopt); + impl->add_statement(*sub, pred, obj); } } if (datatype.has_value()) { impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate, *datatype); - } else if (ref.has_value()) { - impl->add_statement(subject, predicate, *ref); + } else if (sub.has_value()) { + impl->add_statement(subject, predicate, *sub); impl->state_stack_.emplace_back(std::in_place_type_t{}); + } else if (parse_resource) { // TODO tests + Node obj = impl->make_bn(std::nullopt); + impl->add_statement(subject, predicate, obj); + impl->state_stack_.emplace_back(std::in_place_type_t{}, obj); } else { impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate); } @@ -456,7 +492,7 @@ namespace rdf4cpp::parser { 
impl->add_statement(sub, IRI::rdf_type(), obj); } else if (iri_equal_pieces(about_attrib, att.uri(), att.local_name()) || iri_equal_pieces(node_id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(base_attribute, att.uri(), att.local_name())) { continue; - } else { // TODO tests say this is correct, spec does not??? + } else { IRI const pred = impl->try_make_iri(att.uri(), att.local_name(), base); Literal const obj = impl->make_literal(att.value(), std::nullopt, std::nullopt); if (pred.null() || obj.null()) diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index c7588e18..53a07e1a 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -294,6 +294,23 @@ TEST_CASE("rdf") { nt = R"( "10"^^ . "10"^^ .)"; } + SUBCASE("unicode literal") { + xml = R"( + + + + + + + + + +)"; + nt = R"(_:a "D\u00FCrst" . + _:a .)"; + } if (xml.empty()) { return; @@ -305,15 +322,32 @@ TEST_CASE("rdf") { std::stringstream nt_str{nt}; IStreamQuadIterator nt_iter{nt_str, ParsingFlag::NTriples}; + std::map bn_map{}; + auto check = [&bn_map](Node xml, Node nt) { + CHECK(xml.is_blank_node() == nt.is_blank_node()); + if (nt.is_blank_node()) { + auto i = bn_map.find(nt.as_blank_node()); + if (i != bn_map.end()) { + CHECK(xml.as_blank_node() == i->second.as_blank_node()); + } + else { + bn_map[nt.as_blank_node()] = xml.as_blank_node(); + } + } + else { + CHECK(xml == nt); + } + }; + while (nt_iter != std::default_sentinel) { REQUIRE(xml_iter != std::default_sentinel); if (!xml_iter->has_value()) { FAIL(xml_iter->error().message); } REQUIRE(nt_iter->has_value()); - CHECK(xml_iter->value().subject() == nt_iter->value().subject()); - CHECK(xml_iter->value().predicate() == nt_iter->value().predicate()); - CHECK(xml_iter->value().object() == nt_iter->value().object()); + check(xml_iter->value().subject() , nt_iter->value().subject()); + check(xml_iter->value().predicate(), 
nt_iter->value().predicate()); + check(xml_iter->value().object(), nt_iter->value().object()); ++xml_iter; ++nt_iter; From 55d2cc9ff637cf81c96446e1d26a129173a49cd4 Mon Sep 17 00:00:00 2001 From: mcb Date: Thu, 13 Nov 2025 15:16:35 +0100 Subject: [PATCH 11/42] more tests --- src/rdf4cpp/parser/XMLParser.cpp | 23 ++--- tests/parser/tests_XMLParser.cpp | 162 ++++++++++++++++++++++++++++++- 2 files changed, 173 insertions(+), 12 deletions(-) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index 64d16955..a97bf86c 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -387,6 +387,8 @@ namespace rdf4cpp::parser { std::optional datatype = std::nullopt; std::optional sub = std::nullopt; bool parse_resource = false; + bool parse_literal = false; + bool parse_collection = false; for (auto const &att : attributes) { if (iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name())) { datatype = impl->try_make_iri(att.value(), base); @@ -397,6 +399,10 @@ namespace rdf4cpp::parser { } else if (iri_equal_pieces(PredicateState::parse_type_attrib, att.uri(), att.local_name())) { if (att.value() == PredicateState::parse_type_resource) { parse_resource = true; + } else if (att.value() == PredicateState::parse_type_collection) { + parse_collection = true; + } else { + parse_literal = true; } } } @@ -420,6 +426,9 @@ namespace rdf4cpp::parser { impl->add_statement(*sub, pred, obj); } } + if (sub.has_value() && (parse_collection || parse_literal || parse_resource)) { + impl->add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:parseType, rdf:nodeID and rdf:resource"); + } if (datatype.has_value()) { impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate, *datatype); } else if (sub.has_value()) { @@ -479,26 +488,18 @@ namespace rdf4cpp::parser { IRI const obj = impl->try_make_iri(uri, local_name, base); if (!obj.null()) { - impl->add_statement(sub, 
IRI::rdf_type(), obj); + impl->add_statement(sub, IRI::rdf_type(impl->state_->node_storage), obj); } } for (auto const &att : attributes) { - if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { // TODO needs test + if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { // TODO tests IRI const obj = impl->try_make_iri(att.value(), base); - if (obj.null()) - { - continue; - } - impl->add_statement(sub, IRI::rdf_type(), obj); + impl->add_statement(sub, IRI::rdf_type(impl->state_->node_storage), obj); } else if (iri_equal_pieces(about_attrib, att.uri(), att.local_name()) || iri_equal_pieces(node_id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(base_attribute, att.uri(), att.local_name())) { continue; } else { IRI const pred = impl->try_make_iri(att.uri(), att.local_name(), base); Literal const obj = impl->make_literal(att.value(), std::nullopt, std::nullopt); - if (pred.null() || obj.null()) - { - continue; - } impl->add_statement(sub, pred, obj); } } diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index 53a07e1a..032a50c3 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -117,7 +117,7 @@ TEST_CASE("sanity test") { CHECK(it == std::default_sentinel); } -TEST_CASE("rdf") { +TEST_CASE("rdf xml positive tests") { // adapted from https://github.com/w3c/rdf-tests/tree/main/rdf/rdf11/rdf-xml std::string xml = ""; @@ -311,6 +311,111 @@ TEST_CASE("rdf") { nt = R"(_:a "D\u00FCrst" . _:a .)"; } + SUBCASE("unicode iri 1") { + xml = R"( + + + + + + 2000 + +)"; + nt = R"( "2000" .)"; + } + SUBCASE("unicode iri 2") { + xml = R"( + + + + + + 2000 + +)"; + nt = R"( "2000" .)"; + } + SUBCASE("type instead of description") { + xml = R"( + + + + Dogs in Hats + + +)"; + nt = R"( . 
+ "Dogs in Hats" .)"; + } + SUBCASE("id 1") { + xml = R"( + + + abc + +)"; + nt = R"( "abc" .)"; + } + SUBCASE("id 2") { + xml = R"( + + + abc + +)"; + nt = R"( "abc" .)"; + } + SUBCASE("id 3") { + xml = R"( + + + abc + +)"; + nt = R"( "abc" .)"; + } + SUBCASE("duplicate bag entries") { + xml = R"( + + + + + +)"; + nt = R"( . + . + .)"; + } + SUBCASE("empty property 1") { + xml = R"( + + + + + + +)"; + nt = R"( .)"; + } + SUBCASE("empty property 2") { + xml = R"( + + + + + + +)"; + nt = R"( "" .)"; + } if (xml.empty()) { return; @@ -355,3 +460,58 @@ TEST_CASE("rdf") { REQUIRE(xml_iter == std::default_sentinel); } + +TEST_CASE("rdf xml negative tests") { + // adapted from https://github.com/w3c/rdf-tests/tree/main/rdf/rdf11/rdf-xml + std::string xml = ""; + std::vector> expected_msg{}; + bool ignore_some_triples = false; + + SUBCASE("resource + parse type") { + xml = R"( + + + + + +)"; + expected_msg.emplace_back(ParsingError::Type::BadSyntax, "expected only one of rdf:parseType, rdf:nodeID and rdf:resource"); + ignore_some_triples = true; + } + SUBCASE("implicit bn + parse type") { + xml = R"( + + + + + +)"; + expected_msg.emplace_back(ParsingError::Type::BadSyntax, "expected only one of rdf:parseType, rdf:nodeID and rdf:resource"); + ignore_some_triples = true; + } + + if (xml.empty()) { + return; + } + + std::stringstream xml_str{xml}; + XMLQuadIterator xml_iter{xml_str}; + + while (xml_iter != std::default_sentinel) { + if (!ignore_some_triples) { + REQUIRE(!xml_iter->has_value()); + } else if (xml_iter->has_value()) { + ++xml_iter; + continue; + } + REQUIRE(!expected_msg.empty()); + CHECK(xml_iter->error().error_type == expected_msg.back().first); + CHECK(xml_iter->error().message == expected_msg.back().second); + expected_msg.pop_back(); + ++xml_iter; + } + REQUIRE(expected_msg.empty()); +} \ No newline at end of file From ede429fab8bad6e8ee019601e8fc0cf350381ea7 Mon Sep 17 00:00:00 2001 From: mcb Date: Fri, 14 Nov 2025 14:37:14 +0100 Subject: [PATCH 12/42] 
wip xml literal --- conanfile.py | 2 +- src/rdf4cpp/parser/XMLParser.cpp | 72 +++++++++++++++++++++++++-- tests/parser/tests_XMLParser.cpp | 84 ++++++++++++++++++++++++++++++++ 3 files changed, 152 insertions(+), 6 deletions(-) diff --git a/conanfile.py b/conanfile.py index 3d766ff6..dfe224a2 100644 --- a/conanfile.py +++ b/conanfile.py @@ -39,7 +39,7 @@ def requirements(self): self.requires("dice-hash/0.4.11", transitive_headers=True) self.requires("dice-sparse-map/0.2.9", transitive_headers=True) self.requires("dice-template-library/1.13.0", transitive_headers=True) - self.requires("libxml2/2.14.5") + self.requires("libxml2/2.15.0") self.requires("zlib/1.3.1", force=True) if self.options.with_test_deps: diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index a97bf86c..d832f90e 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -115,6 +115,22 @@ namespace rdf4cpp::parser { static constexpr std::string_view datatype_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype"; }; + struct XMLLiteralState final : PredicateState { + void on_characters(Impl *impl, std::string_view chars) override; + void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + + size_t depth = 0; + size_t data_start = 0; + size_t last_offset = 0; + size_t last_size = 0; + + XMLLiteralState(Node iri, IRI predicate) + : PredicateState(iri, predicate) { + } + + void source_input(Impl *impl); + }; struct EmptyElement final : BaseState { void on_characters(Impl *impl, std::string_view chars) override; @@ -123,7 +139,7 @@ namespace rdf4cpp::parser { }; BaseState *current_state_ = nullptr; - std::vector> state_stack_; + std::vector> state_stack_; static xmlSAXHandler make_sax_handler(); @@ -173,12 +189,12 @@ namespace rdf4cpp::parser { [[maybe_unused]] int n_namespaces, 
[[maybe_unused]] xmlChar const **namespaces, int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) { auto *t = static_cast(th); - t->current_state_->on_start_element(t, reinterpret_cast(local_name), reinterpret_cast(uri), + t->current_state_->on_start_element(t, reinterpret_cast(local_name), uri == nullptr ? "" : reinterpret_cast(uri), std::span{reinterpret_cast(attributes), static_cast(n_attributes)}); }; r.endElementNs = [](void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri) { auto *t = static_cast(th); - t->current_state_->on_end_element(t, reinterpret_cast(local_name), reinterpret_cast(uri)); + t->current_state_->on_end_element(t, reinterpret_cast(local_name), uri == nullptr ? "" : reinterpret_cast(uri)); }; r.warning = on_error; r.error = on_error; @@ -410,7 +426,7 @@ namespace rdf4cpp::parser { if (iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name()) || iri_equal_pieces(PredicateState::resource_attrib, att.uri(), att.local_name()) || iri_equal_pieces(node_id_attrib, att.uri(), att.local_name()) || - iri_equal_pieces(PredicateState::parse_type_resource, att.uri(), att.local_name()) || + iri_equal_pieces(PredicateState::parse_type_attrib, att.uri(), att.local_name()) || iri_equal_pieces(lang_attribute, att.uri(), att.local_name())) { continue; } @@ -438,6 +454,9 @@ namespace rdf4cpp::parser { Node obj = impl->make_bn(std::nullopt); impl->add_statement(subject, predicate, obj); impl->state_stack_.emplace_back(std::in_place_type_t{}, obj); + } else if (parse_literal) { // TODO tests + auto& s = impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate); + std::visit([&](T& o) { if constexpr (std::same_as) { o.source_input(impl); }}, s); } else { impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate); } @@ -556,7 +575,50 @@ namespace rdf4cpp::parser { } impl->pop_state(Node::make_null()); } - + void 
XMLQuadIterator::Impl::XMLLiteralState::on_characters(Impl *impl, [[maybe_unused]] std::string_view chars) { + source_input(impl); + } + void XMLQuadIterator::Impl::XMLLiteralState::on_start_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + ++depth; + source_input(impl); + } + void XMLQuadIterator::Impl::XMLLiteralState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { + if (depth > 0) { + --depth; + source_input(impl); + return; + } + IRI datatype = impl->try_make_iri("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral", ""); + std::string_view l = literal; + l = l.substr(0, last_offset); + l.remove_prefix(data_start); + if (l.size() > 0 && l[0] == '/') + { + l.remove_prefix(1); + } + if (l.size() > 0 && l[0] == '>') + { + l.remove_prefix(1); + } + Literal const lit = impl->make_literal(l, datatype, std::nullopt); + impl->add_statement(subject, predicate, lit); + impl->pop_state(Node::make_null()); + } + void XMLQuadIterator::Impl::XMLLiteralState::source_input(Impl *impl) { + const xmlChar* data; + int size = 1024; + int off = 0; + xmlCtxtGetInputWindow(impl->context_.get(), 0, &data, &size, &off); + std::string_view sv{reinterpret_cast(data), static_cast(size)}; + if (literal.empty()) { + data_start = off; + } + if (!static_cast(literal).ends_with(sv)) { + last_size = literal.size(); + literal += sv; + } + last_offset = static_cast(off) + last_size; + } void XMLQuadIterator::Impl::EmptyElement::on_characters(Impl *impl, std::string_view const chars) { if (!trim(chars).empty()) { impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found characters"); diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index 032a50c3..d8e0dd33 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -416,6 +416,90 @@ TEST_CASE("rdf xml 
positive tests") { )"; nt = R"( "" .)"; } + SUBCASE("empty property 3") { + xml = R"( + + + + + + +)"; + nt = R"( ""^^ .)"; + } + SUBCASE("empty property 4") { + xml = R"( + + + + + + +)"; + nt = R"( _:a1 .)"; + } + SUBCASE("empty property 13") { + xml = R"( + + + + + +)"; + nt = R"( "baz" . + .)"; + } + SUBCASE("blank node identity") { + xml = R"( + + + + property value + + +)"; + nt = R"(_:j0 . +_:j0 "property value" .)"; + } + SUBCASE("blank node identity 2") { + xml = R"( + + + + + + + + + + + + + +)"; + nt = R"(_:j0A _:j0A . +_:j2 _:j1B . +_:j1B _:j0A .)"; + } +// SUBCASE("xml literal") { TODO +// xml = R"( +// +// +// +// +//
+//
+// +//
)"; +// nt = R"( "

"^^ .)"; +// } if (xml.empty()) { return; From ea0a1aea33ff1c645110f960df8f56c49f58150b Mon Sep 17 00:00:00 2001 From: mcb Date: Fri, 14 Nov 2025 14:51:30 +0100 Subject: [PATCH 13/42] cleanup --- src/rdf4cpp/parser/XMLParser.cpp | 29 +++++++++++------------------ tests/parser/tests_XMLParser.cpp | 8 +++++++- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index d832f90e..95e7501d 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -75,7 +75,7 @@ namespace rdf4cpp::parser { } template - static bool try_enter(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes, F f); + static void enter(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes, F f); static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description"; static constexpr std::string_view about_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about"; @@ -381,10 +381,7 @@ namespace rdf4cpp::parser { } } void XMLQuadIterator::Impl::RDFState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { - if (DescriptionState::try_enter(impl, local_name, uri, attributes, [](auto) {})) { - return; - } - impl->add_error(ParsingError::Type::BadSyntax, "expected Description, found ???"); + DescriptionState::enter(impl, local_name, uri, attributes, [](auto) {}); } void XMLQuadIterator::Impl::RDFState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { impl->pop_state(Node::make_null()); @@ -435,7 +432,7 @@ namespace rdf4cpp::parser { } if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { IRI const obj = impl->try_make_iri(att.value(), base); - impl->add_statement(*sub, IRI::rdf_type(), obj); + impl->add_statement(*sub, IRI::rdf_type(impl->state_->node_storage), 
obj); } else { IRI const pred = impl->try_make_iri(att.uri(), att.local_name(), base); Literal const obj = impl->make_literal(att.value(), std::nullopt, std::nullopt); @@ -450,13 +447,13 @@ namespace rdf4cpp::parser { } else if (sub.has_value()) { impl->add_statement(subject, predicate, *sub); impl->state_stack_.emplace_back(std::in_place_type_t{}); - } else if (parse_resource) { // TODO tests + } else if (parse_resource) { Node obj = impl->make_bn(std::nullopt); impl->add_statement(subject, predicate, obj); impl->state_stack_.emplace_back(std::in_place_type_t{}, obj); } else if (parse_literal) { // TODO tests - auto& s = impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate); - std::visit([&](T& o) { if constexpr (std::same_as) { o.source_input(impl); }}, s); + auto& xml_state = impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate); + std::visit([&](T& o) { if constexpr (std::same_as) { o.source_input(impl); }}, xml_state); } else { impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate); } @@ -469,7 +466,7 @@ namespace rdf4cpp::parser { impl->pop_state(subject); } template - bool XMLQuadIterator::Impl::DescriptionState::try_enter(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { + void XMLQuadIterator::Impl::DescriptionState::enter(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { auto base = try_handle_base_attrib(impl, attributes); Node sub = Node::make_null(); auto check_only_one = [&sub, impl]() { @@ -492,7 +489,7 @@ namespace rdf4cpp::parser { std::string i = "#"; i.append(att.value()); sub = impl->try_make_iri(i, base); - } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { // TODO test case + } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { if (check_only_one()) { continue; } @@ -511,7 +508,7 @@ namespace rdf4cpp::parser { } } 
for (auto const &att : attributes) { - if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { // TODO tests + if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { IRI const obj = impl->try_make_iri(att.value(), base); impl->add_statement(sub, IRI::rdf_type(impl->state_->node_storage), obj); } else if (iri_equal_pieces(about_attrib, att.uri(), att.local_name()) || iri_equal_pieces(node_id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(base_attribute, att.uri(), att.local_name())) { @@ -528,7 +525,6 @@ namespace rdf4cpp::parser { if (!base.empty()) { impl->current_state_->base = base; } - return true; } void XMLQuadIterator::Impl::PredicateState::on_characters([[maybe_unused]] Impl *impl, std::string_view const chars) { @@ -549,13 +545,10 @@ namespace rdf4cpp::parser { impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found element"); return; } - if (DescriptionState::try_enter(impl, local_name, uri, attributes, [&](Node obj) { + DescriptionState::enter(impl, local_name, uri, attributes, [&](Node obj) { done = true; impl->add_statement(subject, predicate, obj); - })) { - return; - } - impl->add_error(ParsingError::Type::BadSyntax, "expected Description or literal, found element"); + }); } void XMLQuadIterator::Impl::PredicateState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { if (!done) { diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index d8e0dd33..26649b44 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -15,7 +15,7 @@ using namespace rdf4cpp::parser; TEST_CASE("sanity test") { std::stringstream str{R"( - + example 42 not a number @@ -44,6 +44,12 @@ TEST_CASE("sanity test") { CHECK(it != std::default_sentinel); CHECK(it->has_value()); CHECK(it->value().subject() == IRI::make("https://www.example.com")); + 
CHECK(it->value().predicate() == IRI::make("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")); + CHECK(it->value().object() == IRI::make("https://www.example2.com/type")); + ++it; + CHECK(it != std::default_sentinel); + CHECK(it->has_value()); + CHECK(it->value().subject() == IRI::make("https://www.example.com")); CHECK(it->value().predicate() == IRI::make("https://www.example.com/title")); CHECK(it->value().object() == Literal::make_simple("example")); ++it; From 74a2538eeb3cd0fd138c623b525043941d40c2f3 Mon Sep 17 00:00:00 2001 From: mcb Date: Fri, 14 Nov 2025 15:58:41 +0100 Subject: [PATCH 14/42] collection --- src/rdf4cpp/parser/XMLParser.cpp | 48 +++++++++++++++++++++++++++++++- tests/parser/tests_XMLParser.cpp | 30 ++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index 95e7501d..5a9275fb 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -131,6 +131,22 @@ namespace rdf4cpp::parser { void source_input(Impl *impl); }; + struct CollectionState final : BaseState { + void on_characters(Impl *impl, std::string_view chars) override; + void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + + Node subject; + IRI predicate; + Node last_bn = Node::make_null(); + bool first = true; + + CollectionState(Node sub, IRI pred) : subject(sub), predicate(pred) {} + + static constexpr std::string_view iri_nil = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil"; + static constexpr std::string_view iri_rest = "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest"; + static constexpr std::string_view iri_first = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first"; + }; struct EmptyElement final : BaseState { void on_characters(Impl *impl, std::string_view chars) override; @@ -139,7 +155,7 @@ 
namespace rdf4cpp::parser { }; BaseState *current_state_ = nullptr; - std::vector> state_stack_; + std::vector> state_stack_; static xmlSAXHandler make_sax_handler(); @@ -454,6 +470,8 @@ namespace rdf4cpp::parser { } else if (parse_literal) { // TODO tests auto& xml_state = impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate); std::visit([&](T& o) { if constexpr (std::same_as) { o.source_input(impl); }}, xml_state); + } else if (parse_collection) { + impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate); } else { impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate); } @@ -612,6 +630,34 @@ namespace rdf4cpp::parser { } last_offset = static_cast(off) + last_size; } + void XMLQuadIterator::Impl::CollectionState::on_characters(Impl *impl, std::string_view const chars) { + if (!trim(chars).empty()) { + impl->add_error(ParsingError::Type::BadSyntax, "expected element, found characters"); + } + } + void XMLQuadIterator::Impl::CollectionState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { + DescriptionState::enter(impl, local_name, uri, attributes, [&](Node const obj) { + if (first) { + first = false; + last_bn = impl->make_bn(std::nullopt); + impl->add_statement(subject, predicate, last_bn); + } else { + auto const bn = impl->make_bn(std::nullopt); + impl->add_statement(last_bn, impl->try_make_iri(iri_rest, ""), bn); + last_bn = bn; + } + impl->add_statement(last_bn, impl->try_make_iri(iri_first, ""), obj); + }); + } + void XMLQuadIterator::Impl::CollectionState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { + auto const nil = impl->try_make_iri(iri_nil, ""); + if (first) { + impl->add_statement(subject, predicate, nil); + } else { + impl->add_statement(last_bn, impl->try_make_iri(iri_rest, ""), nil); + } + impl->pop_state(Node::make_null()); + } void 
XMLQuadIterator::Impl::EmptyElement::on_characters(Impl *impl, std::string_view const chars) { if (!trim(chars).empty()) { impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found characters"); diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index 26649b44..c18bb253 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -37,6 +37,7 @@ TEST_CASE("sanity test") { blank example 2 + )"}; @@ -120,6 +121,12 @@ TEST_CASE("sanity test") { CHECK(it->value().predicate() == IRI::make("https://www.example.com/title")); CHECK(it->value().object() == Literal::make_simple("blank example 2")); ++it; + CHECK(it != std::default_sentinel); + CHECK(it->has_value()); + CHECK(it->value().subject() == IRI::make("https://www.example.com")); + CHECK(it->value().predicate() == IRI::make("https://www.example.com/coll")); + CHECK(it->value().object() == IRI::make("http://www.w3.org/1999/02/22-rdf-syntax-ns#nil")); + ++it; CHECK(it == std::default_sentinel); } @@ -493,6 +500,29 @@ _:j0 "property value" .)"; _:j2 _:j1B . _:j1B _:j0A .)"; } + SUBCASE("collection") { + xml = R"( + + + + + + + + + + +)"; + nt = R"( _:a0 . +_:a0 _:a1 . +_:a1 . +_:a1 _:a2 . +_:a2 . 
+_:a2 .)"; + } // SUBCASE("xml literal") { TODO // xml = R"( // Date: Wed, 19 Nov 2025 14:52:03 +0100 Subject: [PATCH 15/42] reifycation --- src/rdf4cpp/parser/XMLParser.cpp | 132 +++++++++++++++++++------------ tests/parser/tests_XMLParser.cpp | 100 +++++++++++++++++++++++ 2 files changed, 180 insertions(+), 52 deletions(-) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index 5a9275fb..dc0beef9 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -17,6 +17,12 @@ namespace rdf4cpp::parser { size_t next_bn_index_ = 0; std::unique_ptr owned_state_; state_type *state_; + std::set reserved_ids_; // TODO faster alternative + + static constexpr std::string_view reify_subject = "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject"; + static constexpr std::string_view reify_predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate"; + static constexpr std::string_view reify_object = "http://www.w3.org/1999/02/22-rdf-syntax-ns#object"; + static constexpr std::string_view reify_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement"; struct Attribute { xmlChar const *local_name_raw; @@ -90,11 +96,12 @@ namespace rdf4cpp::parser { Node subject; IRI predicate; + IRI reify; std::string literal; bool done = false; - PredicateState(Node sub, IRI predicate) - : subject(sub), predicate(predicate) { + PredicateState(Node sub, IRI predicate, IRI reify) + : subject(sub), predicate(predicate), reify(reify) { } static constexpr std::string_view resource_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#resource"; @@ -109,8 +116,8 @@ namespace rdf4cpp::parser { IRI datatype; - TypedLiteralPredicateState(Node iri, IRI predicate, IRI datatype) - : PredicateState(iri, predicate), datatype(datatype) { + TypedLiteralPredicateState(Node iri, IRI predicate, IRI reify, IRI datatype) + : PredicateState(iri, predicate, reify), datatype(datatype) { } static constexpr std::string_view datatype_attrib = 
"http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype"; @@ -125,9 +132,7 @@ namespace rdf4cpp::parser { size_t last_offset = 0; size_t last_size = 0; - XMLLiteralState(Node iri, IRI predicate) - : PredicateState(iri, predicate) { - } + using PredicateState::PredicateState; void source_input(Impl *impl); }; @@ -139,9 +144,10 @@ namespace rdf4cpp::parser { Node subject; IRI predicate; Node last_bn = Node::make_null(); + IRI reify; bool first = true; - CollectionState(Node sub, IRI pred) : subject(sub), predicate(pred) {} + CollectionState(Node sub, IRI pred, IRI reify) : subject(sub), predicate(pred), reify(reify) {} static constexpr std::string_view iri_nil = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil"; static constexpr std::string_view iri_rest = "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest"; @@ -167,15 +173,17 @@ namespace rdf4cpp::parser { * @param predicate * @param object */ - void add_statement(Node subject, IRI predicate, Node object); + void add_statement(Node subject, IRI predicate, Node object, IRI reify); void update_current_state(); void pop_state(Node object); static std::string_view trim(std::string_view v); static bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); template NT inspect_node(NT node); - IRI try_make_iri(std::string_view iri, std::string_view base); - IRI try_make_iri(std::string_view uri, std::string_view local_name, std::string_view base); + IRI make_iri(std::string_view iri, std::string_view base); + IRI make_iri(std::string_view uri, std::string_view local_name, std::string_view base); + IRI make_id(std::string_view local_name, std::string_view base); + [[nodiscard]] IRI make_hardcoded_iri(std::string_view iri) const; Node make_bn(std::optional name); Literal make_literal(std::string_view value, std::optional datatype, std::optional lang_tag); @@ -222,11 +230,17 @@ namespace rdf4cpp::parser { uint64_t const col = xmlSAX2GetColumnNumber(context_.get()); 
result_queue_.emplace_back(nonstd::unexpect, ty, lin, col, std::move(msg)); } - void XMLQuadIterator::Impl::add_statement(Node subject, IRI predicate, Node object) { + void XMLQuadIterator::Impl::add_statement(Node const subject, IRI const predicate, Node const object, IRI const reify) { if (subject.null() || predicate.null() || object.null()) { return; } result_queue_.emplace_back(Quad(subject, predicate, object)); + if (!reify.null()) { + result_queue_.emplace_back(Quad(reify, IRI::make_unchecked(reify_subject, state_->node_storage), subject)); + result_queue_.emplace_back(Quad(reify, IRI::make_unchecked(reify_predicate, state_->node_storage), predicate)); + result_queue_.emplace_back(Quad(reify, IRI::make_unchecked(reify_object, state_->node_storage), object)); + result_queue_.emplace_back(Quad(reify, IRI::rdf_type(state_->node_storage), IRI::make_unchecked(reify_type, state_->node_storage))); + } } void XMLQuadIterator::Impl::update_current_state() { if (state_stack_.empty()) { @@ -270,7 +284,7 @@ namespace rdf4cpp::parser { } return NT::make_null(); } - IRI XMLQuadIterator::Impl::try_make_iri(std::string_view const iri, std::string_view base) { + IRI XMLQuadIterator::Impl::make_iri(std::string_view const iri, std::string_view const base) { if (base.empty()) { for (const auto &s : state_stack_ | std::ranges::views::reverse) { auto const v = std::visit([](const auto& s) -> std::string_view { return s.base; }, s); @@ -291,10 +305,24 @@ namespace rdf4cpp::parser { return IRI::make_null(); } } - IRI XMLQuadIterator::Impl::try_make_iri(std::string_view const uri, std::string_view const local_name, std::string_view base) { + IRI XMLQuadIterator::Impl::make_iri(std::string_view const uri, std::string_view const local_name, std::string_view const base) { std::string iri{uri}; iri.append(local_name); - return try_make_iri(iri, base); + return make_iri(iri, base); + } + IRI XMLQuadIterator::Impl::make_id(std::string_view const local_name, std::string_view const base) { + 
std::string local = "#"; + local.append(local_name); + auto iri = make_iri(local, base); + if (reserved_ids_.contains(iri)) { + add_error(ParsingError::Type::BadIri, std::format("{}: is already used as a rdf:ID", iri)); + return IRI::make_null(); + } + reserved_ids_.insert(iri); + return iri; + } + IRI XMLQuadIterator::Impl::make_hardcoded_iri(std::string_view const iri) const { + return IRI::make_unchecked(iri, state_->node_storage); } Node XMLQuadIterator::Impl::make_bn(std::optional name) { std::string n = ""; @@ -410,21 +438,22 @@ namespace rdf4cpp::parser { } void XMLQuadIterator::Impl::DescriptionState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span attributes) { auto base = try_handle_base_attrib(impl, attributes); - std::string s{uri}; - s.append(local_name); - auto predicate = impl->try_make_iri(s, base); + auto predicate = impl->make_iri(uri, local_name, base); std::optional datatype = std::nullopt; std::optional sub = std::nullopt; + IRI reify = IRI::make_null(); bool parse_resource = false; bool parse_literal = false; bool parse_collection = false; for (auto const &att : attributes) { if (iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name())) { - datatype = impl->try_make_iri(att.value(), base); + datatype = impl->make_iri(att.value(), base); } else if (iri_equal_pieces(PredicateState::resource_attrib, att.uri(), att.local_name())) { - sub = impl->try_make_iri(att.value(), base); + sub = impl->make_iri(att.value(), base); } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { sub = impl->make_bn(att.value()); + } else if (iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { + reify = impl->make_id(att.value(), base); } else if (iri_equal_pieces(PredicateState::parse_type_attrib, att.uri(), att.local_name())) { if (att.value() == PredicateState::parse_type_resource) { parse_resource = true; @@ -440,40 +469,41 @@ namespace 
rdf4cpp::parser { iri_equal_pieces(PredicateState::resource_attrib, att.uri(), att.local_name()) || iri_equal_pieces(node_id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(PredicateState::parse_type_attrib, att.uri(), att.local_name()) || - iri_equal_pieces(lang_attribute, att.uri(), att.local_name())) { + iri_equal_pieces(lang_attribute, att.uri(), att.local_name()) || + iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { continue; } if (!sub.has_value()) { sub = impl->make_bn(std::nullopt); } if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { - IRI const obj = impl->try_make_iri(att.value(), base); - impl->add_statement(*sub, IRI::rdf_type(impl->state_->node_storage), obj); + IRI const obj = impl->make_iri(att.value(), base); + impl->add_statement(*sub, IRI::rdf_type(impl->state_->node_storage), obj, IRI::make_null()); } else { - IRI const pred = impl->try_make_iri(att.uri(), att.local_name(), base); + IRI const pred = impl->make_iri(att.uri(), att.local_name(), base); Literal const obj = impl->make_literal(att.value(), std::nullopt, std::nullopt); - impl->add_statement(*sub, pred, obj); + impl->add_statement(*sub, pred, obj, IRI::make_null()); } } if (sub.has_value() && (parse_collection || parse_literal || parse_resource)) { impl->add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:parseType, rdf:nodeID and rdf:resource"); } if (datatype.has_value()) { - impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate, *datatype); + impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate, reify, *datatype); } else if (sub.has_value()) { - impl->add_statement(subject, predicate, *sub); + impl->add_statement(subject, predicate, *sub, reify); impl->state_stack_.emplace_back(std::in_place_type_t{}); } else if (parse_resource) { Node obj = impl->make_bn(std::nullopt); - impl->add_statement(subject, predicate, obj); + impl->add_statement(subject, predicate, obj, reify); 
impl->state_stack_.emplace_back(std::in_place_type_t{}, obj); } else if (parse_literal) { // TODO tests - auto& xml_state = impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate); + auto& xml_state = impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate, reify); std::visit([&](T& o) { if constexpr (std::same_as) { o.source_input(impl); }}, xml_state); } else if (parse_collection) { - impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate); + impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate, reify); } else { - impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate); + impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate, reify); } impl->update_current_state(); if (!base.empty()) { @@ -499,14 +529,12 @@ namespace rdf4cpp::parser { if (check_only_one()) { continue; } - sub = impl->try_make_iri(att.value(), base); + sub = impl->make_iri(att.value(), base); } else if (iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { if (check_only_one()) { continue; } - std::string i = "#"; - i.append(att.value()); - sub = impl->try_make_iri(i, base); + sub = impl->make_id(att.value(), base); } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { if (check_only_one()) { continue; @@ -519,22 +547,22 @@ namespace rdf4cpp::parser { sub = impl->make_bn(std::nullopt); } if (!iri_equal_pieces(start_element, uri, local_name)) { - IRI const obj = impl->try_make_iri(uri, local_name, base); + IRI const obj = impl->make_iri(uri, local_name, base); if (!obj.null()) { - impl->add_statement(sub, IRI::rdf_type(impl->state_->node_storage), obj); + impl->add_statement(sub, IRI::rdf_type(impl->state_->node_storage), obj, IRI::make_null()); } } for (auto const &att : attributes) { if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { - IRI const obj = impl->try_make_iri(att.value(), base); - impl->add_statement(sub, 
IRI::rdf_type(impl->state_->node_storage), obj); + IRI const obj = impl->make_iri(att.value(), base); + impl->add_statement(sub, IRI::rdf_type(impl->state_->node_storage), obj, IRI::make_null()); } else if (iri_equal_pieces(about_attrib, att.uri(), att.local_name()) || iri_equal_pieces(node_id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(base_attribute, att.uri(), att.local_name())) { continue; } else { - IRI const pred = impl->try_make_iri(att.uri(), att.local_name(), base); + IRI const pred = impl->make_iri(att.uri(), att.local_name(), base); Literal const obj = impl->make_literal(att.value(), std::nullopt, std::nullopt); - impl->add_statement(sub, pred, obj); + impl->add_statement(sub, pred, obj, IRI::make_null()); } } f(sub); @@ -565,13 +593,13 @@ namespace rdf4cpp::parser { } DescriptionState::enter(impl, local_name, uri, attributes, [&](Node obj) { done = true; - impl->add_statement(subject, predicate, obj); + impl->add_statement(subject, predicate, obj, reify); }); } void XMLQuadIterator::Impl::PredicateState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { if (!done) { Literal const lit = impl->make_literal(literal, std::nullopt, std::nullopt); - impl->add_statement(subject, predicate, lit); + impl->add_statement(subject, predicate, lit, reify); } impl->pop_state(Node::make_null()); } @@ -582,7 +610,7 @@ namespace rdf4cpp::parser { void XMLQuadIterator::Impl::TypedLiteralPredicateState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { if (!datatype.null()) { Literal const lit = impl->make_literal(literal, datatype, std::nullopt); - impl->add_statement(subject, predicate, lit); + impl->add_statement(subject, predicate, lit, reify); } impl->pop_state(Node::make_null()); } @@ -599,7 +627,7 @@ namespace rdf4cpp::parser { source_input(impl); return; } - IRI 
datatype = impl->try_make_iri("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral", ""); + IRI datatype = impl->make_hardcoded_iri("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"); std::string_view l = literal; l = l.substr(0, last_offset); l.remove_prefix(data_start); @@ -612,7 +640,7 @@ namespace rdf4cpp::parser { l.remove_prefix(1); } Literal const lit = impl->make_literal(l, datatype, std::nullopt); - impl->add_statement(subject, predicate, lit); + impl->add_statement(subject, predicate, lit, reify); impl->pop_state(Node::make_null()); } void XMLQuadIterator::Impl::XMLLiteralState::source_input(Impl *impl) { @@ -640,21 +668,21 @@ namespace rdf4cpp::parser { if (first) { first = false; last_bn = impl->make_bn(std::nullopt); - impl->add_statement(subject, predicate, last_bn); + impl->add_statement(subject, predicate, last_bn, reify); } else { auto const bn = impl->make_bn(std::nullopt); - impl->add_statement(last_bn, impl->try_make_iri(iri_rest, ""), bn); + impl->add_statement(last_bn, impl->make_hardcoded_iri(iri_rest), bn, IRI::make_null()); last_bn = bn; } - impl->add_statement(last_bn, impl->try_make_iri(iri_first, ""), obj); + impl->add_statement(last_bn, impl->make_hardcoded_iri(iri_first), obj, IRI::make_null()); }); } void XMLQuadIterator::Impl::CollectionState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { - auto const nil = impl->try_make_iri(iri_nil, ""); + auto const nil = impl->make_hardcoded_iri(iri_nil); if (first) { - impl->add_statement(subject, predicate, nil); + impl->add_statement(subject, predicate, nil, reify); } else { - impl->add_statement(last_bn, impl->try_make_iri(iri_rest, ""), nil); + impl->add_statement(last_bn, impl->make_hardcoded_iri(iri_rest), nil, IRI::make_null()); } impl->pop_state(Node::make_null()); } diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index c18bb253..84ce802c 100644 --- 
a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -523,6 +523,93 @@ _:a1 _:a2 . _:a2 . _:a2 .)"; } + SUBCASE("nested reify") { + xml = R"( + + + + + + + + + +)"; + nt = R"( + . + . + . + . + . + . + . + . + . + .)"; + } + SUBCASE("reify target") { + xml = R"( + + + + + +)"; + nt = R"(_:j88091 "val" . +_:j88090 _:j88091 . + _:j88090 . + . + _:j88091 . + .)"; + } + SUBCASE("reify collection") { + xml = R"( + + + + + + + + + + +)"; + nt = R"( _:a0 . +_:a0 _:a1 . + _:a0 . + . + _:a1 . + . +_:a1 . +_:a1 _:a2 . +_:a2 . +_:a2 .)"; + } + SUBCASE("reify literal") { + xml = R"( + + + + v + + +)"; + nt = R"(_:j0 "v" . + _:j0 . + . + "v" . + .)"; + } // SUBCASE("xml literal") { TODO // xml = R"( // + + + abc + + + abc + +)"; + expected_msg.emplace_back(ParsingError::Type::BadIri, ": is already used as a rdf:ID"); + ignore_some_triples = true; + } if (xml.empty()) { return; From 97327ede4ada8954ee07b34f5b30e3bc4da94bc9 Mon Sep 17 00:00:00 2001 From: mcb Date: Wed, 19 Nov 2025 16:27:22 +0100 Subject: [PATCH 16/42] lang tag --- src/rdf4cpp/parser/XMLParser.cpp | 107 ++++++++++++++++++++++--------- tests/parser/tests_XMLParser.cpp | 23 +++++++ 2 files changed, 98 insertions(+), 32 deletions(-) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index dc0beef9..08b7dd6d 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -51,13 +51,15 @@ namespace rdf4cpp::parser { } std::string base; + std::string lang_tag; static constexpr std::string_view base_attribute = "http://www.w3.org/XML/1998/namespacebase"; static constexpr std::string_view lang_attribute = "http://www.w3.org/XML/1998/namespacelang"; static std::string_view try_handle_base_attrib(Impl* impl, std::span attributes); + static std::string_view try_handle_lang_attrib(std::span attributes); }; - struct EmptyState final : BaseState { + struct InitialState final : BaseState { void on_characters(Impl *impl, std::string_view chars) 
override; void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; @@ -88,6 +90,7 @@ namespace rdf4cpp::parser { static constexpr std::string_view id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#ID"; static constexpr std::string_view node_id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID"; static constexpr std::string_view type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; + static constexpr std::string_view list_start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#li"; }; struct PredicateState : BaseState { void on_characters(Impl *impl, std::string_view chars) override; @@ -109,6 +112,8 @@ namespace rdf4cpp::parser { static constexpr std::string_view parse_type_resource = "Resource"; static constexpr std::string_view parse_type_literal = "Literal"; static constexpr std::string_view parse_type_collection = "Collection"; + + static bool iri_reserved_predicate(std::string_view uri, std::string_view local_name); }; struct TypedLiteralPredicateState final : PredicateState { void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; @@ -161,7 +166,7 @@ namespace rdf4cpp::parser { }; BaseState *current_state_ = nullptr; - std::vector> state_stack_; + std::vector> state_stack_; static xmlSAXHandler make_sax_handler(); @@ -178,6 +183,7 @@ namespace rdf4cpp::parser { void pop_state(Node object); static std::string_view trim(std::string_view v); static bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); + static bool iri_reserved(std::string_view uri, std::string_view local_name); template NT inspect_node(NT node); IRI make_iri(std::string_view iri, std::string_view base); @@ -270,6 +276,25 @@ namespace rdf4cpp::parser { } return full_iri.starts_with(uri) && 
full_iri.ends_with(local_name); } + bool XMLQuadIterator::Impl::iri_reserved(std::string_view uri, std::string_view local_name) { + static constexpr std::array reserved = { + RDFState::start_element, + DescriptionState::id_attrib, + DescriptionState::about_attrib, + PredicateState::parse_type_attrib, + PredicateState::resource_attrib, + DescriptionState::node_id_attrib, + TypedLiteralPredicateState::datatype_attrib, + BaseState::base_attribute, + BaseState::lang_attribute, + std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"), + std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"), + std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"), + }; + return std::ranges::any_of(reserved, [&](std::string_view const e) { + return iri_equal_pieces(e, uri, local_name); + }); + } template NT XMLQuadIterator::Impl::inspect_node(NT node) { try { @@ -353,12 +378,22 @@ namespace rdf4cpp::parser { try { if (datatype.has_value()) { l = Literal::make_typed(value, *datatype, state_->node_storage); - } - else if (lang_tag.has_value()) { - l = Literal::make_lang_tagged(value, *lang_tag, state_->node_storage); - } - else { - l = Literal::make_simple(value); + } else { + if (!lang_tag.has_value() || lang_tag->empty()) { + for (const auto &s : state_stack_ | std::ranges::views::reverse) { + auto const v = std::visit([](const auto& s) -> std::string_view { return s.lang_tag; }, s); + if (!v.empty()) { + lang_tag = v; + break; + } + } + } + if (lang_tag.has_value() && !lang_tag->empty()) { + l = Literal::make_lang_tagged(value, *lang_tag, state_->node_storage); + } + else { + l = Literal::make_simple(value); + } } } catch (InvalidNode const &e) { @@ -386,7 +421,7 @@ namespace rdf4cpp::parser { va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay) } - std::string_view XMLQuadIterator::Impl::BaseState::try_handle_base_attrib(Impl* impl, std::span attributes) { + std::string_view 
XMLQuadIterator::Impl::BaseState::try_handle_base_attrib(Impl* impl, std::span const attributes) { for (const auto& a : attributes) { if (iri_equal_pieces(base_attribute, a.uri(), a.local_name())) { @@ -398,24 +433,33 @@ namespace rdf4cpp::parser { } return ""; } - void XMLQuadIterator::Impl::EmptyState::on_characters(Impl *impl, std::string_view const chars) { + std::string_view XMLQuadIterator::Impl::BaseState::try_handle_lang_attrib(std::span const attributes) { + for (const auto& a : attributes) { + if (iri_equal_pieces(lang_attribute, a.uri(), a.local_name())) + { + return a.value(); + } + } + return ""; + } + void XMLQuadIterator::Impl::InitialState::on_characters(Impl *impl, std::string_view const chars) { if (!trim(chars).empty()) { impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found characters"); } } - void XMLQuadIterator::Impl::EmptyState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes) { + void XMLQuadIterator::Impl::InitialState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes) { if (iri_equal_pieces(RDFState::start_element, uri, local_name)) { auto const base = try_handle_base_attrib(impl, attributes); + auto const lang = try_handle_lang_attrib(attributes); impl->state_stack_.emplace_back(std::in_place_type_t{}); impl->update_current_state(); - if (!base.empty()) { - impl->current_state_->base = base; - } + impl->current_state_->base = base; + impl->current_state_->lang_tag = lang; return; } impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found ???"); } - void XMLQuadIterator::Impl::EmptyState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { + void XMLQuadIterator::Impl::InitialState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) 
{ impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found end of ???"); } @@ -438,6 +482,7 @@ namespace rdf4cpp::parser { } void XMLQuadIterator::Impl::DescriptionState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span attributes) { auto base = try_handle_base_attrib(impl, attributes); + auto lang = try_handle_lang_attrib(attributes); auto predicate = impl->make_iri(uri, local_name, base); std::optional datatype = std::nullopt; std::optional sub = std::nullopt; @@ -465,12 +510,7 @@ namespace rdf4cpp::parser { } } for (auto const &att : attributes) { - if (iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name()) || - iri_equal_pieces(PredicateState::resource_attrib, att.uri(), att.local_name()) || - iri_equal_pieces(node_id_attrib, att.uri(), att.local_name()) || - iri_equal_pieces(PredicateState::parse_type_attrib, att.uri(), att.local_name()) || - iri_equal_pieces(lang_attribute, att.uri(), att.local_name()) || - iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { + if (PredicateState::iri_reserved_predicate(att.uri(), att.local_name())) { continue; } if (!sub.has_value()) { @@ -481,7 +521,7 @@ namespace rdf4cpp::parser { impl->add_statement(*sub, IRI::rdf_type(impl->state_->node_storage), obj, IRI::make_null()); } else { IRI const pred = impl->make_iri(att.uri(), att.local_name(), base); - Literal const obj = impl->make_literal(att.value(), std::nullopt, std::nullopt); + Literal const obj = impl->make_literal(att.value(), std::nullopt, lang); impl->add_statement(*sub, pred, obj, IRI::make_null()); } } @@ -506,9 +546,8 @@ namespace rdf4cpp::parser { impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate, reify); } impl->update_current_state(); - if (!base.empty()) { - impl->current_state_->base = base; - } + impl->current_state_->base = base; + impl->current_state_->lang_tag = lang; } void 
XMLQuadIterator::Impl::DescriptionState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { impl->pop_state(subject); @@ -516,6 +555,7 @@ namespace rdf4cpp::parser { template void XMLQuadIterator::Impl::DescriptionState::enter(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { auto base = try_handle_base_attrib(impl, attributes); + auto lang = try_handle_lang_attrib(attributes); Node sub = Node::make_null(); auto check_only_one = [&sub, impl]() { if (!sub.null()) { @@ -554,23 +594,23 @@ namespace rdf4cpp::parser { } } for (auto const &att : attributes) { + if (PredicateState::iri_reserved_predicate(att.uri(), att.local_name())) { + continue; + } if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { IRI const obj = impl->make_iri(att.value(), base); impl->add_statement(sub, IRI::rdf_type(impl->state_->node_storage), obj, IRI::make_null()); - } else if (iri_equal_pieces(about_attrib, att.uri(), att.local_name()) || iri_equal_pieces(node_id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(id_attrib, att.uri(), att.local_name()) || iri_equal_pieces(base_attribute, att.uri(), att.local_name())) { - continue; } else { IRI const pred = impl->make_iri(att.uri(), att.local_name(), base); - Literal const obj = impl->make_literal(att.value(), std::nullopt, std::nullopt); + Literal const obj = impl->make_literal(att.value(), std::nullopt, lang); impl->add_statement(sub, pred, obj, IRI::make_null()); } } f(sub); impl->state_stack_.emplace_back(std::in_place_type_t{}, sub); impl->update_current_state(); - if (!base.empty()) { - impl->current_state_->base = base; - } + impl->current_state_->base = base; + impl->current_state_->lang_tag = lang; } void XMLQuadIterator::Impl::PredicateState::on_characters([[maybe_unused]] Impl *impl, std::string_view const chars) { @@ -604,6 +644,9 @@ namespace rdf4cpp::parser { 
impl->pop_state(Node::make_null()); } + bool XMLQuadIterator::Impl::PredicateState::iri_reserved_predicate(std::string_view const uri, std::string_view const local_name) { + return iri_reserved(uri, local_name) || iri_equal_pieces(DescriptionState::start_element, uri, local_name) || iri_equal_pieces(DescriptionState::list_start_element, uri, local_name); + } void XMLQuadIterator::Impl::TypedLiteralPredicateState::on_start_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { impl->add_error(ParsingError::Type::BadSyntax, "expected literal, found element"); } @@ -704,7 +747,7 @@ namespace rdf4cpp::parser { reader_obj_(obj), read_func_(read), error_func_(err), eof_func_(eof), owned_state_(state == nullptr ? std::make_unique() : nullptr), state_(state == nullptr ? owned_state_.get() : state){ xmlCtxtSetOptions(context_.get(), XML_PARSE_NOENT | XML_PARSE_PEDANTIC | XML_PARSE_NOCDATA | XML_PARSE_NO_XXE | XML_PARSE_BIG_LINES); - state_stack_.emplace_back(std::in_place_type_t{}); + state_stack_.emplace_back(std::in_place_type_t{}); update_current_state(); current_state_->base = IRIFactory::default_base; } diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index 84ce802c..4aa4fd81 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -610,6 +610,29 @@ _:a2 "v" . 
.)"; } + SUBCASE("lang literal") { + xml = R"( + + + + chat + +)"; + nt = R"( "chat"@fr .)"; + } + SUBCASE("lang literal attribute") { + xml = R"( + + + + +)"; + nt = R"( "chat"@fr .)"; + } // SUBCASE("xml literal") { TODO // xml = R"( // Date: Wed, 19 Nov 2025 17:27:34 +0100 Subject: [PATCH 17/42] some cleanup --- src/rdf4cpp/parser/XMLParser.cpp | 107 +++++++++++++++---------------- 1 file changed, 52 insertions(+), 55 deletions(-) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index 08b7dd6d..ec579b27 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -50,19 +50,27 @@ namespace rdf4cpp::parser { virtual void re_enter([[maybe_unused]] Impl *impl, [[maybe_unused]] Node obj) { // TODO remove if not needed for something } + struct InheritedAttributeInfo { + std::string_view base = ""; + std::string_view lang_tag = ""; + }; + std::string base; std::string lang_tag; + explicit BaseState(InheritedAttributeInfo const &i) : base(i.base), lang_tag(i.lang_tag) {} + static constexpr std::string_view base_attribute = "http://www.w3.org/XML/1998/namespacebase"; static constexpr std::string_view lang_attribute = "http://www.w3.org/XML/1998/namespacelang"; - static std::string_view try_handle_base_attrib(Impl* impl, std::span attributes); - static std::string_view try_handle_lang_attrib(std::span attributes); + static InheritedAttributeInfo get_inherited_attributes(Impl* impl, std::span attributes); }; struct InitialState final : BaseState { void on_characters(Impl *impl, std::string_view chars) override; void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + + InitialState() : BaseState({}) {} }; struct RDFState final : BaseState { void on_characters(Impl *impl, std::string_view chars) override; @@ -70,6 +78,8 @@ namespace rdf4cpp::parser { void 
on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF"; + + using BaseState::BaseState; }; struct DescriptionState final : BaseState { void on_characters(Impl *impl, std::string_view chars) override; @@ -78,8 +88,8 @@ namespace rdf4cpp::parser { Node subject; - explicit DescriptionState(Node sub) - : subject(sub) { + explicit DescriptionState(InheritedAttributeInfo const &i, Node sub) + : BaseState(i), subject(sub) { } template @@ -103,8 +113,8 @@ namespace rdf4cpp::parser { std::string literal; bool done = false; - PredicateState(Node sub, IRI predicate, IRI reify) - : subject(sub), predicate(predicate), reify(reify) { + PredicateState(InheritedAttributeInfo const &i, Node sub, IRI predicate, IRI reify) + : BaseState(i), subject(sub), predicate(predicate), reify(reify) { } static constexpr std::string_view resource_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#resource"; @@ -121,8 +131,8 @@ namespace rdf4cpp::parser { IRI datatype; - TypedLiteralPredicateState(Node iri, IRI predicate, IRI reify, IRI datatype) - : PredicateState(iri, predicate, reify), datatype(datatype) { + TypedLiteralPredicateState(InheritedAttributeInfo const &i, Node iri, IRI predicate, IRI reify, IRI datatype) + : PredicateState(i, iri, predicate, reify), datatype(datatype) { } static constexpr std::string_view datatype_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype"; @@ -152,17 +162,18 @@ namespace rdf4cpp::parser { IRI reify; bool first = true; - CollectionState(Node sub, IRI pred, IRI reify) : subject(sub), predicate(pred), reify(reify) {} + CollectionState(InheritedAttributeInfo const &i, Node sub, IRI pred, IRI reify) : BaseState(i), subject(sub), predicate(pred), reify(reify) {} static constexpr std::string_view iri_nil = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil"; static constexpr std::string_view iri_rest = 
"http://www.w3.org/1999/02/22-rdf-syntax-ns#rest"; static constexpr std::string_view iri_first = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first"; }; - struct EmptyElement final : BaseState { void on_characters(Impl *impl, std::string_view chars) override; void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + + EmptyElement() :BaseState({}) {} }; BaseState *current_state_ = nullptr; @@ -421,26 +432,22 @@ namespace rdf4cpp::parser { va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay) } - std::string_view XMLQuadIterator::Impl::BaseState::try_handle_base_attrib(Impl* impl, std::span const attributes) { + XMLQuadIterator::Impl::BaseState::InheritedAttributeInfo XMLQuadIterator::Impl::BaseState::get_inherited_attributes(Impl *impl, std::span attributes) { + InheritedAttributeInfo r{}; for (const auto& a : attributes) { if (iri_equal_pieces(base_attribute, a.uri(), a.local_name())) { - if (auto r = IRIView(a.value()).quick_validate(); r != IRIFactoryError::Ok) { - impl->add_error(ParsingError::Type::BadIri, std::format("invalid base IRI ({}): {}", r, a.value())); + if (auto e = IRIView(a.value()).quick_validate(); e != IRIFactoryError::Ok) { + impl->add_error(ParsingError::Type::BadIri, std::format("invalid base IRI ({}): {}", e, a.value())); } - return a.value(); + r.base = a.value(); } - } - return ""; - } - std::string_view XMLQuadIterator::Impl::BaseState::try_handle_lang_attrib(std::span const attributes) { - for (const auto& a : attributes) { - if (iri_equal_pieces(lang_attribute, a.uri(), a.local_name())) + else if (iri_equal_pieces(lang_attribute, a.uri(), a.local_name())) { - return a.value(); + r.lang_tag = a.value(); } } - return ""; + return r; } void XMLQuadIterator::Impl::InitialState::on_characters(Impl *impl, std::string_view const chars) { if (!trim(chars).empty()) { @@ -449,12 +456,8 @@ 
namespace rdf4cpp::parser { } void XMLQuadIterator::Impl::InitialState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes) { if (iri_equal_pieces(RDFState::start_element, uri, local_name)) { - auto const base = try_handle_base_attrib(impl, attributes); - auto const lang = try_handle_lang_attrib(attributes); - impl->state_stack_.emplace_back(std::in_place_type_t{}); + impl->state_stack_.emplace_back(std::in_place_type_t{}, get_inherited_attributes(impl, attributes)); impl->update_current_state(); - impl->current_state_->base = base; - impl->current_state_->lang_tag = lang; return; } impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found ???"); @@ -481,9 +484,8 @@ namespace rdf4cpp::parser { } } void XMLQuadIterator::Impl::DescriptionState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span attributes) { - auto base = try_handle_base_attrib(impl, attributes); - auto lang = try_handle_lang_attrib(attributes); - auto predicate = impl->make_iri(uri, local_name, base); + auto const i = get_inherited_attributes(impl, attributes); + auto predicate = impl->make_iri(uri, local_name, i.base); std::optional datatype = std::nullopt; std::optional sub = std::nullopt; IRI reify = IRI::make_null(); @@ -492,13 +494,13 @@ namespace rdf4cpp::parser { bool parse_collection = false; for (auto const &att : attributes) { if (iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name())) { - datatype = impl->make_iri(att.value(), base); + datatype = impl->make_iri(att.value(), i.base); } else if (iri_equal_pieces(PredicateState::resource_attrib, att.uri(), att.local_name())) { - sub = impl->make_iri(att.value(), base); + sub = impl->make_iri(att.value(), i.base); } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { sub = impl->make_bn(att.value()); } else if (iri_equal_pieces(id_attrib, 
att.uri(), att.local_name())) { - reify = impl->make_id(att.value(), base); + reify = impl->make_id(att.value(), i.base); } else if (iri_equal_pieces(PredicateState::parse_type_attrib, att.uri(), att.local_name())) { if (att.value() == PredicateState::parse_type_resource) { parse_resource = true; @@ -521,7 +523,7 @@ namespace rdf4cpp::parser { impl->add_statement(*sub, IRI::rdf_type(impl->state_->node_storage), obj, IRI::make_null()); } else { IRI const pred = impl->make_iri(att.uri(), att.local_name(), base); - Literal const obj = impl->make_literal(att.value(), std::nullopt, lang); + Literal const obj = impl->make_literal(att.value(), std::nullopt, i.lang_tag); impl->add_statement(*sub, pred, obj, IRI::make_null()); } } @@ -529,33 +531,30 @@ namespace rdf4cpp::parser { impl->add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:parseType, rdf:nodeID and rdf:resource"); } if (datatype.has_value()) { - impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate, reify, *datatype); + impl->state_stack_.emplace_back(std::in_place_type_t{}, i, subject, predicate, reify, *datatype); } else if (sub.has_value()) { impl->add_statement(subject, predicate, *sub, reify); impl->state_stack_.emplace_back(std::in_place_type_t{}); } else if (parse_resource) { Node obj = impl->make_bn(std::nullopt); impl->add_statement(subject, predicate, obj, reify); - impl->state_stack_.emplace_back(std::in_place_type_t{}, obj); + impl->state_stack_.emplace_back(std::in_place_type_t{}, i, obj); } else if (parse_literal) { // TODO tests - auto& xml_state = impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate, reify); + auto& xml_state = impl->state_stack_.emplace_back(std::in_place_type_t{}, i, subject, predicate, reify); std::visit([&](T& o) { if constexpr (std::same_as) { o.source_input(impl); }}, xml_state); } else if (parse_collection) { - impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate, reify); + 
impl->state_stack_.emplace_back(std::in_place_type_t{}, i, subject, predicate, reify); } else { - impl->state_stack_.emplace_back(std::in_place_type_t{}, subject, predicate, reify); + impl->state_stack_.emplace_back(std::in_place_type_t{}, i, subject, predicate, reify); } impl->update_current_state(); - impl->current_state_->base = base; - impl->current_state_->lang_tag = lang; } void XMLQuadIterator::Impl::DescriptionState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { impl->pop_state(subject); } template void XMLQuadIterator::Impl::DescriptionState::enter(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { - auto base = try_handle_base_attrib(impl, attributes); - auto lang = try_handle_lang_attrib(attributes); + auto const i = get_inherited_attributes(impl, attributes); Node sub = Node::make_null(); auto check_only_one = [&sub, impl]() { if (!sub.null()) { @@ -569,12 +568,12 @@ namespace rdf4cpp::parser { if (check_only_one()) { continue; } - sub = impl->make_iri(att.value(), base); + sub = impl->make_iri(att.value(), i.base); } else if (iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { if (check_only_one()) { continue; } - sub = impl->make_id(att.value(), base); + sub = impl->make_id(att.value(), i.base); } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { if (check_only_one()) { continue; @@ -587,7 +586,7 @@ namespace rdf4cpp::parser { sub = impl->make_bn(std::nullopt); } if (!iri_equal_pieces(start_element, uri, local_name)) { - IRI const obj = impl->make_iri(uri, local_name, base); + IRI const obj = impl->make_iri(uri, local_name, i.base); if (!obj.null()) { impl->add_statement(sub, IRI::rdf_type(impl->state_->node_storage), obj, IRI::make_null()); @@ -598,19 +597,17 @@ namespace rdf4cpp::parser { continue; } if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { - IRI const obj = 
impl->make_iri(att.value(), base); + IRI const obj = impl->make_iri(att.value(), i.base); impl->add_statement(sub, IRI::rdf_type(impl->state_->node_storage), obj, IRI::make_null()); } else { - IRI const pred = impl->make_iri(att.uri(), att.local_name(), base); - Literal const obj = impl->make_literal(att.value(), std::nullopt, lang); + IRI const pred = impl->make_iri(att.uri(), att.local_name(), i.base); + Literal const obj = impl->make_literal(att.value(), std::nullopt, i.lang_tag); impl->add_statement(sub, pred, obj, IRI::make_null()); } } f(sub); - impl->state_stack_.emplace_back(std::in_place_type_t{}, sub); + impl->state_stack_.emplace_back(std::in_place_type_t{}, i, sub); impl->update_current_state(); - impl->current_state_->base = base; - impl->current_state_->lang_tag = lang; } void XMLQuadIterator::Impl::PredicateState::on_characters([[maybe_unused]] Impl *impl, std::string_view const chars) { @@ -674,11 +671,11 @@ namespace rdf4cpp::parser { std::string_view l = literal; l = l.substr(0, last_offset); l.remove_prefix(data_start); - if (l.size() > 0 && l[0] == '/') + if (!l.empty() && l[0] == '/') { l.remove_prefix(1); } - if (l.size() > 0 && l[0] == '>') + if (!l.empty() && l[0] == '>') { l.remove_prefix(1); } @@ -691,7 +688,7 @@ namespace rdf4cpp::parser { int size = 1024; int off = 0; xmlCtxtGetInputWindow(impl->context_.get(), 0, &data, &size, &off); - std::string_view sv{reinterpret_cast(data), static_cast(size)}; + std::string_view const sv{reinterpret_cast(data), static_cast(size)}; if (literal.empty()) { data_start = off; } From fb86ba82cb6383ae55a4c8e9914bb23f5cfa20fe Mon Sep 17 00:00:00 2001 From: mcb Date: Thu, 20 Nov 2025 13:27:29 +0100 Subject: [PATCH 18/42] list --- src/rdf4cpp/parser/XMLParser.cpp | 13 +++- tests/parser/tests_XMLParser.cpp | 123 ++++++++++++++++++++++++++++++- 2 files changed, 131 insertions(+), 5 deletions(-) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index ec579b27..8f17e1b2 100644 --- 
a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -87,6 +87,7 @@ namespace rdf4cpp::parser { void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; Node subject; + size_t list_current = 1; explicit DescriptionState(InheritedAttributeInfo const &i, Node sub) : BaseState(i), subject(sub) { @@ -100,7 +101,6 @@ namespace rdf4cpp::parser { static constexpr std::string_view id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#ID"; static constexpr std::string_view node_id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID"; static constexpr std::string_view type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; - static constexpr std::string_view list_start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#li"; }; struct PredicateState : BaseState { void on_characters(Impl *impl, std::string_view chars) override; @@ -122,6 +122,7 @@ namespace rdf4cpp::parser { static constexpr std::string_view parse_type_resource = "Resource"; static constexpr std::string_view parse_type_literal = "Literal"; static constexpr std::string_view parse_type_collection = "Collection"; + static constexpr std::string_view list_start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#li"; static bool iri_reserved_predicate(std::string_view uri, std::string_view local_name); }; @@ -485,7 +486,13 @@ namespace rdf4cpp::parser { } void XMLQuadIterator::Impl::DescriptionState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span attributes) { auto const i = get_inherited_attributes(impl, attributes); - auto predicate = impl->make_iri(uri, local_name, i.base); + IRI predicate; + if (iri_equal_pieces(PredicateState::list_start_element, uri, local_name)) { + predicate = impl->make_iri(std::format("http://www.w3.org/1999/02/22-rdf-syntax-ns#_{}", list_current++), i.base); + } + else { + predicate = impl->make_iri(uri, local_name, i.base); + } std::optional datatype 
= std::nullopt; std::optional sub = std::nullopt; IRI reify = IRI::make_null(); @@ -642,7 +649,7 @@ namespace rdf4cpp::parser { } bool XMLQuadIterator::Impl::PredicateState::iri_reserved_predicate(std::string_view const uri, std::string_view const local_name) { - return iri_reserved(uri, local_name) || iri_equal_pieces(DescriptionState::start_element, uri, local_name) || iri_equal_pieces(DescriptionState::list_start_element, uri, local_name); + return iri_reserved(uri, local_name) || iri_equal_pieces(DescriptionState::start_element, uri, local_name) || iri_equal_pieces(list_start_element, uri, local_name); } void XMLQuadIterator::Impl::TypedLiteralPredicateState::on_start_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { impl->add_error(ParsingError::Type::BadSyntax, "expected literal, found element"); diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index 4aa4fd81..b072d278 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -633,6 +633,126 @@ _:a2 )"; nt = R"( "chat"@fr .)"; } + SUBCASE("simple list") { + xml = R"( + + + + 1 + 2 + +)"; + nt = R"(_:bag . +_:bag "1" . +_:bag "2" .)"; + } + SUBCASE("list interference") { + xml = R"( + + + + + _1 + 1 + _3 + 2 + +)"; + nt = R"(_:bag . +_:bag "_1" . +_:bag "1" . +_:bag "_3" . +_:bag "2" .)"; + } + SUBCASE("list advanced") { + xml = R"( + + + + + 1 + 2 + + + + + +)"; + nt = R"(_:bar . +_:bar "1" . + _:bar . + . + "1" . + . +_:bar "2"^^ . +_:bar _:res . +_:res . +_:res2 "foobar" . +_:bar _:res2 . + _:bar . + . + _:res2 . + .)"; + } + SUBCASE("list other") { + xml = R"( + + + + + barfoo + + +)"; + nt = R"( . + "3" . + "foobar" . + . + "2" . + "foobar" . + "barfoo" . +_:bag .)"; + } + SUBCASE("list independence") { + xml = R"( + + + + + + 1 + 2 + + + 2 + +)"; + nt = R"(_:d1 _:d2 . + +_:d2 "1" . +_:d2 "2" . 
+ +_:d1 "2" .)"; + } + SUBCASE("list per element") { + xml = R"( + + + + 1 + + + + 1-again + +)"; + nt = R"( "1" . + "1-again" .)"; + } // SUBCASE("xml literal") { TODO // xml = R"( // bn_map{}; auto check = [&bn_map](Node xml, Node nt) { - CHECK(xml.is_blank_node() == nt.is_blank_node()); - if (nt.is_blank_node()) { + if (nt.is_blank_node() && xml.is_blank_node()) { auto i = bn_map.find(nt.as_blank_node()); if (i != bn_map.end()) { CHECK(xml.as_blank_node() == i->second.as_blank_node()); From 1f7f0cc9939a8d32b622b299b2394be2e6c6ed5d Mon Sep 17 00:00:00 2001 From: mcb Date: Thu, 20 Nov 2025 17:49:22 +0100 Subject: [PATCH 19/42] more cleanup --- src/rdf4cpp/parser/XMLParser.cpp | 100 ++++++++++++++++++------------- 1 file changed, 60 insertions(+), 40 deletions(-) diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/src/rdf4cpp/parser/XMLParser.cpp index 8f17e1b2..823c92b2 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/src/rdf4cpp/parser/XMLParser.cpp @@ -2,6 +2,7 @@ #include #include +#include #include namespace rdf4cpp::parser { @@ -17,13 +18,35 @@ namespace rdf4cpp::parser { size_t next_bn_index_ = 0; std::unique_ptr owned_state_; state_type *state_; - std::set reserved_ids_; // TODO faster alternative + dice::sparse_map::sparse_set reserved_ids_; static constexpr std::string_view reify_subject = "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject"; static constexpr std::string_view reify_predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate"; static constexpr std::string_view reify_object = "http://www.w3.org/1999/02/22-rdf-syntax-ns#object"; static constexpr std::string_view reify_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement"; + static std::string_view from_xml_char(xmlChar const * s) { + if (s == nullptr) { + return ""; + } + // ReSharper disable once CppDFALocalValueEscapesFunction + return {reinterpret_cast(s)}; + } + static std::string_view from_xml_char(xmlChar const * s, xmlChar const * e) { + if (s == nullptr) { + return ""; + 
} + // ReSharper disable once CppDFALocalValueEscapesFunction + return {reinterpret_cast(s), reinterpret_cast(e)}; + } + static std::string_view from_xml_char(xmlChar const * s, int const n) { + if (s == nullptr) { + return ""; + } + // ReSharper disable once CppDFALocalValueEscapesFunction + return {reinterpret_cast(s), static_cast(n)}; + } + struct Attribute { xmlChar const *local_name_raw; xmlChar const *prefix_raw; @@ -32,13 +55,13 @@ namespace rdf4cpp::parser { xmlChar const *value_end_raw; [[nodiscard]] std::string_view value() const { - return {reinterpret_cast(value_start_raw), reinterpret_cast(value_end_raw)}; + return from_xml_char(value_start_raw, value_end_raw); } [[nodiscard]] std::string_view local_name() const { - return {reinterpret_cast(local_name_raw)}; + return from_xml_char(local_name_raw); } [[nodiscard]] std::string_view uri() const { - return {reinterpret_cast(uri_raw)}; + return from_xml_char(uri_raw); } }; @@ -46,9 +69,7 @@ namespace rdf4cpp::parser { virtual ~BaseState() = default; virtual void on_characters(Impl *impl, std::string_view chars) = 0; virtual void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) = 0; - virtual void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) = 0; // TODO remove params? 
- virtual void re_enter([[maybe_unused]] Impl *impl, [[maybe_unused]] Node obj) { // TODO remove if not needed for something - } + virtual void on_end_element(Impl *impl) = 0; struct InheritedAttributeInfo { std::string_view base = ""; @@ -68,14 +89,14 @@ namespace rdf4cpp::parser { struct InitialState final : BaseState { void on_characters(Impl *impl, std::string_view chars) override; void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + void on_end_element(Impl *impl) override; InitialState() : BaseState({}) {} }; struct RDFState final : BaseState { void on_characters(Impl *impl, std::string_view chars) override; void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + void on_end_element(Impl *impl) override; static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF"; @@ -84,7 +105,7 @@ namespace rdf4cpp::parser { struct DescriptionState final : BaseState { void on_characters(Impl *impl, std::string_view chars) override; void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + void on_end_element(Impl *impl) override; Node subject; size_t list_current = 1; @@ -105,7 +126,7 @@ namespace rdf4cpp::parser { struct PredicateState : BaseState { void on_characters(Impl *impl, std::string_view chars) override; void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + void on_end_element(Impl *impl) override; Node 
subject; IRI predicate; @@ -128,7 +149,7 @@ namespace rdf4cpp::parser { }; struct TypedLiteralPredicateState final : PredicateState { void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + void on_end_element(Impl *impl) override; IRI datatype; @@ -141,7 +162,7 @@ namespace rdf4cpp::parser { struct XMLLiteralState final : PredicateState { void on_characters(Impl *impl, std::string_view chars) override; void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + void on_end_element(Impl *impl) override; size_t depth = 0; size_t data_start = 0; @@ -155,7 +176,7 @@ namespace rdf4cpp::parser { struct CollectionState final : BaseState { void on_characters(Impl *impl, std::string_view chars) override; void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + void on_end_element(Impl *impl) override; Node subject; IRI predicate; @@ -172,7 +193,7 @@ namespace rdf4cpp::parser { struct EmptyElement final : BaseState { void on_characters(Impl *impl, std::string_view chars) override; void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl, std::string_view local_name, std::string_view uri) override; + void on_end_element(Impl *impl) override; EmptyElement() :BaseState({}) {} }; @@ -192,7 +213,7 @@ namespace rdf4cpp::parser { */ void add_statement(Node subject, IRI predicate, Node object, IRI reify); void update_current_state(); - void pop_state(Node object); + void pop_state(); static std::string_view trim(std::string_view 
v); static bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); static bool iri_reserved(std::string_view uri, std::string_view local_name); @@ -223,20 +244,20 @@ namespace rdf4cpp::parser { r.getEntity = [](void *, xmlChar const *e) { return xmlGetPredefinedEntity(e); }; - r.characters = [](void *th, xmlChar const *e, int len) { + r.characters = [](void *th, xmlChar const *e, int const len) { auto *t = static_cast(th); - t->current_state_->on_characters(t, std::string_view(reinterpret_cast(e), static_cast(len))); + t->current_state_->on_characters(t, from_xml_char(e, len)); }; r.startElementNs = [](void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri, [[maybe_unused]] int n_namespaces, [[maybe_unused]] xmlChar const **namespaces, int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) { auto *t = static_cast(th); - t->current_state_->on_start_element(t, reinterpret_cast(local_name), uri == nullptr ? "" : reinterpret_cast(uri), + t->current_state_->on_start_element(t, from_xml_char(local_name), from_xml_char(uri), std::span{reinterpret_cast(attributes), static_cast(n_attributes)}); }; - r.endElementNs = [](void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri) { + r.endElementNs = [](void *th, [[maybe_unused]] xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, [[maybe_unused]] xmlChar const *uri) { auto *t = static_cast(th); - t->current_state_->on_end_element(t, reinterpret_cast(local_name), uri == nullptr ? 
"" : reinterpret_cast(uri)); + t->current_state_->on_end_element(t); }; r.warning = on_error; r.error = on_error; @@ -267,11 +288,10 @@ namespace rdf4cpp::parser { } current_state_ = std::visit([](auto &s) -> BaseState * { return &s; }, state_stack_.back()); } - void XMLQuadIterator::Impl::pop_state(Node object) { + void XMLQuadIterator::Impl::pop_state() { assert(!state_stack_.empty()); state_stack_.pop_back(); update_current_state(); - current_state_->re_enter(this, object); } std::string_view XMLQuadIterator::Impl::trim(std::string_view v) { auto s = v.find_first_not_of(" \t\r\n"); @@ -433,7 +453,7 @@ namespace rdf4cpp::parser { va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay) } - XMLQuadIterator::Impl::BaseState::InheritedAttributeInfo XMLQuadIterator::Impl::BaseState::get_inherited_attributes(Impl *impl, std::span attributes) { + XMLQuadIterator::Impl::BaseState::InheritedAttributeInfo XMLQuadIterator::Impl::BaseState::get_inherited_attributes(Impl *impl, std::span const attributes) { InheritedAttributeInfo r{}; for (const auto& a : attributes) { if (iri_equal_pieces(base_attribute, a.uri(), a.local_name())) @@ -463,7 +483,7 @@ namespace rdf4cpp::parser { } impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found ???"); } - void XMLQuadIterator::Impl::InitialState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { + void XMLQuadIterator::Impl::InitialState::on_end_element(Impl *impl) { impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found end of ???"); } @@ -475,8 +495,8 @@ namespace rdf4cpp::parser { void XMLQuadIterator::Impl::RDFState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { DescriptionState::enter(impl, local_name, uri, attributes, [](auto) {}); } - void XMLQuadIterator::Impl::RDFState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] 
std::string_view uri) { - impl->pop_state(Node::make_null()); + void XMLQuadIterator::Impl::RDFState::on_end_element(Impl *impl) { + impl->pop_state(); } void XMLQuadIterator::Impl::DescriptionState::on_characters(Impl *impl, std::string_view const chars) { @@ -556,8 +576,8 @@ namespace rdf4cpp::parser { } impl->update_current_state(); } - void XMLQuadIterator::Impl::DescriptionState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { - impl->pop_state(subject); + void XMLQuadIterator::Impl::DescriptionState::on_end_element(Impl *impl) { + impl->pop_state(); } template void XMLQuadIterator::Impl::DescriptionState::enter(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { @@ -626,7 +646,7 @@ namespace rdf4cpp::parser { } literal.append(chars); } - void XMLQuadIterator::Impl::PredicateState::on_start_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + void XMLQuadIterator::Impl::PredicateState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { if (!trim(literal).empty()) { impl->add_error(ParsingError::Type::BadSyntax, "expected end of element or literal, found element"); return; @@ -640,12 +660,12 @@ namespace rdf4cpp::parser { impl->add_statement(subject, predicate, obj, reify); }); } - void XMLQuadIterator::Impl::PredicateState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { + void XMLQuadIterator::Impl::PredicateState::on_end_element(Impl *impl) { if (!done) { Literal const lit = impl->make_literal(literal, std::nullopt, std::nullopt); impl->add_statement(subject, predicate, lit, reify); } - impl->pop_state(Node::make_null()); + impl->pop_state(); } bool 
XMLQuadIterator::Impl::PredicateState::iri_reserved_predicate(std::string_view const uri, std::string_view const local_name) { @@ -654,12 +674,12 @@ namespace rdf4cpp::parser { void XMLQuadIterator::Impl::TypedLiteralPredicateState::on_start_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { impl->add_error(ParsingError::Type::BadSyntax, "expected literal, found element"); } - void XMLQuadIterator::Impl::TypedLiteralPredicateState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { + void XMLQuadIterator::Impl::TypedLiteralPredicateState::on_end_element(Impl *impl) { if (!datatype.null()) { Literal const lit = impl->make_literal(literal, datatype, std::nullopt); impl->add_statement(subject, predicate, lit, reify); } - impl->pop_state(Node::make_null()); + impl->pop_state(); } void XMLQuadIterator::Impl::XMLLiteralState::on_characters(Impl *impl, [[maybe_unused]] std::string_view chars) { source_input(impl); @@ -668,7 +688,7 @@ namespace rdf4cpp::parser { ++depth; source_input(impl); } - void XMLQuadIterator::Impl::XMLLiteralState::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { + void XMLQuadIterator::Impl::XMLLiteralState::on_end_element(Impl *impl) { if (depth > 0) { --depth; source_input(impl); @@ -688,7 +708,7 @@ namespace rdf4cpp::parser { } Literal const lit = impl->make_literal(l, datatype, std::nullopt); impl->add_statement(subject, predicate, lit, reify); - impl->pop_state(Node::make_null()); + impl->pop_state(); } void XMLQuadIterator::Impl::XMLLiteralState::source_input(Impl *impl) { const xmlChar* data; @@ -724,14 +744,14 @@ namespace rdf4cpp::parser { impl->add_statement(last_bn, impl->make_hardcoded_iri(iri_first), obj, IRI::make_null()); }); } - void XMLQuadIterator::Impl::CollectionState::on_end_element(Impl *impl, 
[[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { + void XMLQuadIterator::Impl::CollectionState::on_end_element(Impl *impl) { auto const nil = impl->make_hardcoded_iri(iri_nil); if (first) { impl->add_statement(subject, predicate, nil, reify); } else { impl->add_statement(last_bn, impl->make_hardcoded_iri(iri_rest), nil, IRI::make_null()); } - impl->pop_state(Node::make_null()); + impl->pop_state(); } void XMLQuadIterator::Impl::EmptyElement::on_characters(Impl *impl, std::string_view const chars) { if (!trim(chars).empty()) { @@ -741,8 +761,8 @@ namespace rdf4cpp::parser { void XMLQuadIterator::Impl::EmptyElement::on_start_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found ???"); } - void XMLQuadIterator::Impl::EmptyElement::on_end_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri) { - impl->pop_state(Node::make_null()); + void XMLQuadIterator::Impl::EmptyElement::on_end_element(Impl *impl) { + impl->pop_state(); } XMLQuadIterator::Impl::Impl(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type* state) From c1eebb04896db361bb9677a5d462f4a76267367e Mon Sep 17 00:00:00 2001 From: mcb Date: Fri, 21 Nov 2025 14:30:23 +0100 Subject: [PATCH 20/42] one parser to rule them all --- CMakeLists.txt | 2 +- .../parser/IStreamQuadIteratorSerdImpl.cpp | 136 +++++------ .../parser/IStreamQuadIteratorSerdImpl.hpp | 21 +- {src => private}/rdf4cpp/parser/XMLParser.cpp | 227 +++++++++--------- src/rdf4cpp/parser/IStreamQuadIterator.cpp | 19 +- src/rdf4cpp/parser/IStreamQuadIterator.hpp | 36 ++- src/rdf4cpp/parser/ParsingFlags.hpp | 5 +- src/rdf4cpp/parser/RDFFileParser.cpp | 2 +- src/rdf4cpp/parser/XMLParser.hpp | 85 ------- tests/bench_SerDe.cpp | 1 + tests/parser/tests_IStreamQuadIterator.cpp | 2 +- 
tests/parser/tests_XMLParser.cpp | 10 +- 12 files changed, 243 insertions(+), 303 deletions(-) rename {src => private}/rdf4cpp/parser/XMLParser.cpp (76%) delete mode 100644 src/rdf4cpp/parser/XMLParser.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ebeb1e2..b8acdff3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,7 +131,6 @@ add_library(rdf4cpp src/rdf4cpp/namespaces/RDF.cpp src/rdf4cpp/parser/IStreamQuadIterator.cpp src/rdf4cpp/parser/RDFFileParser.cpp - src/rdf4cpp/parser/XMLParser.cpp src/rdf4cpp/query/QuadPattern.cpp src/rdf4cpp/query/Solution.cpp src/rdf4cpp/query/TriplePattern.cpp @@ -150,6 +149,7 @@ add_library(rdf4cpp src/rdf4cpp/IRIView.cpp src/rdf4cpp/IRIFactory.cpp private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp + private/rdf4cpp/parser/XMLParser.cpp private/rdf4cpp/regex/RegexImpl.cpp private/rdf4cpp/regex/RegexReplacerImpl.cpp ${serd_source_files} diff --git a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp index b5bb3f3a..b687317b 100644 --- a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp +++ b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp @@ -7,11 +7,11 @@ namespace rdf4cpp::parser { -std::string_view IStreamQuadIterator::Impl::node_into_string_view(SerdNode const *node) noexcept { +std::string_view IStreamQuadIterator::ImplSerd::node_into_string_view(SerdNode const *node) noexcept { return std::string_view{reinterpret_cast(node->buf), node->n_bytes}; } -ParsingError::Type IStreamQuadIterator::Impl::parsing_error_type_from_serd(SerdStatus const st) noexcept { +ParsingError::Type IStreamQuadIterator::ImplSerd::parsing_error_type_from_serd(SerdStatus const st) noexcept { switch (st) { case SERD_ERR_BAD_SYNTAX: return ParsingError::Type::BadSyntax; @@ -26,13 +26,13 @@ ParsingError::Type IStreamQuadIterator::Impl::parsing_error_type_from_serd(SerdS } } -nonstd::expected IStreamQuadIterator::Impl::get_bnode(std::string &&graph_str, SerdNode const 
*node) noexcept { +nonstd::expected IStreamQuadIterator::ImplSerd::get_bnode(std::string &&graph_str, SerdNode const *node) noexcept { auto const node_str = node_into_string_view(node); if (this->flags.contains(ParsingFlag::NoParseBlankNode)) { this->last_error = ParsingError{.error_type = ParsingError::Type::BadSyntax, - .line = serd_reader_get_current_line(this->reader), - .col = serd_reader_get_current_col(this->reader), + .line = serd_reader_get_current_line(this->reader.get()), + .col = serd_reader_get_current_col(this->reader.get()), .message = "Encountered blank node while parsing. hint: blank nodes are not allowed in the current document. note: position may not be accurate and instead point to the end of the line."}; return nonstd::make_unexpected(SERD_ERR_BAD_SYNTAX); @@ -47,22 +47,22 @@ nonstd::expected IStreamQuadIterator::Impl::get_bnode(std::str } catch (InvalidNode const &e) { // NOTE: line, col not entirely accurate as this function is called after a triple was parsed this->last_error = ParsingError{.error_type = ParsingError::Type::BadBlankNode, - .line = serd_reader_get_current_line(this->reader), - .col = serd_reader_get_current_col(this->reader), + .line = serd_reader_get_current_line(this->reader.get()), + .col = serd_reader_get_current_col(this->reader.get()), .message = std::string{e.what()} + ". note: position may not be accurate and instead point to the end of the triple."}; return nonstd::make_unexpected(SERD_ERR_BAD_SYNTAX); } catch (...) { this->last_error = ParsingError{.error_type = ParsingError::Type::BadBlankNode, - .line = serd_reader_get_current_line(this->reader), - .col = serd_reader_get_current_col(this->reader), + .line = serd_reader_get_current_line(this->reader.get()), + .col = serd_reader_get_current_col(this->reader.get()), .message = "Unknown internal error. 
note: position may not be accurate and instead point to the end of the triple."}; return nonstd::make_unexpected(SERD_ERR_BAD_SYNTAX); } } -nonstd::expected IStreamQuadIterator::Impl::get_iri(SerdNode const *node) noexcept { +nonstd::expected IStreamQuadIterator::ImplSerd::get_iri(SerdNode const *node) noexcept { auto const iri = [this, node]() noexcept { auto const s = node_into_string_view(node); @@ -76,8 +76,8 @@ nonstd::expected IStreamQuadIterator::Impl::get_iri(SerdNode co if (!iri.has_value()) { IRIFactoryError err = iri.error(); this->last_error = ParsingError{.error_type = ParsingError::Type::BadIri, - .line = serd_reader_get_current_line(this->reader), - .col = serd_reader_get_current_col(this->reader), + .line = serd_reader_get_current_line(this->reader.get()), + .col = serd_reader_get_current_col(this->reader.get()), .message = std::format("invalid iri. {}. note: position may not be accurate and instead point to the end of the triple.", err)}; return nonstd::make_unexpected(SERD_ERR_BAD_SYNTAX); @@ -86,11 +86,11 @@ nonstd::expected IStreamQuadIterator::Impl::get_iri(SerdNode co return *iri; } -nonstd::expected IStreamQuadIterator::Impl::get_prefixed_iri(SerdNode const *node) noexcept { +nonstd::expected IStreamQuadIterator::ImplSerd::get_prefixed_iri(SerdNode const *node) noexcept { if (!flags.syntax_allows_prefixes()) [[unlikely]] { this->last_error = ParsingError{.error_type = ParsingError::Type::BadSyntax, - .line = serd_reader_get_current_line(this->reader), - .col = serd_reader_get_current_col(this->reader), + .line = serd_reader_get_current_line(this->reader.get()), + .col = serd_reader_get_current_col(this->reader.get()), .message = "Encountered prefix while parsing. hint: prefixes are not allowed in the current document. 
note: position may not be accurate and instead point to the end of the line."}; return nonstd::make_unexpected(SERD_ERR_BAD_SYNTAX); @@ -112,15 +112,15 @@ nonstd::expected IStreamQuadIterator::Impl::get_prefixed_iri(Se if (err == IRIFactoryError::UnknownPrefix) { // NOTE: line, col not entirely accurate as this function is called after a triple was parsed this->last_error = ParsingError{.error_type = ParsingError::Type::BadCurie, - .line = serd_reader_get_current_line(this->reader), - .col = serd_reader_get_current_col(this->reader), + .line = serd_reader_get_current_line(this->reader.get()), + .col = serd_reader_get_current_col(this->reader.get()), .message = "unknown prefix. note: position may not be accurate and instead point to the end of the triple."}; return nonstd::make_unexpected(SERD_ERR_BAD_CURIE); } else { this->last_error = ParsingError{.error_type = ParsingError::Type::BadIri, - .line = serd_reader_get_current_line(this->reader), - .col = serd_reader_get_current_col(this->reader), + .line = serd_reader_get_current_line(this->reader.get()), + .col = serd_reader_get_current_col(this->reader.get()), .message = std::format("unable to expand curie into valid iri. {}. 
note: position may not be accurate and instead point to the end of the triple.", err)}; return nonstd::make_unexpected(SERD_ERR_BAD_SYNTAX); @@ -130,7 +130,7 @@ nonstd::expected IStreamQuadIterator::Impl::get_prefixed_iri(Se return *iri; } -nonstd::expected IStreamQuadIterator::Impl::get_literal(SerdNode const *literal, SerdNode const *datatype, SerdNode const *lang) noexcept { +nonstd::expected IStreamQuadIterator::ImplSerd::get_literal(SerdNode const *literal, SerdNode const *datatype, SerdNode const *lang) noexcept { auto const literal_value = node_into_string_view(literal); auto const datatype_iri = [&]() -> std::optional> { @@ -163,15 +163,15 @@ nonstd::expected IStreamQuadIterator::Impl::get_literal(Ser } catch (InvalidNode const &e) { // NOTE: line, col not entirely accurate as this function is called after a triple was parsed this->last_error = ParsingError{.error_type = ParsingError::Type::BadLiteral, - .line = serd_reader_get_current_line(this->reader), - .col = serd_reader_get_current_col(this->reader), + .line = serd_reader_get_current_line(this->reader.get()), + .col = serd_reader_get_current_col(this->reader.get()), .message = std::string{e.what()} + ". note: position may not be accurate and instead point to the end of the triple."}; return nonstd::make_unexpected(SERD_ERR_BAD_SYNTAX); } catch (...) { this->last_error = ParsingError{.error_type = ParsingError::Type::BadLiteral, - .line = serd_reader_get_current_line(this->reader), - .col = serd_reader_get_current_col(this->reader), + .line = serd_reader_get_current_line(this->reader.get()), + .col = serd_reader_get_current_col(this->reader.get()), .message = "Unknown internal error. 
note: position may not be accurate and instead point to the end of the triple."}; return nonstd::make_unexpected(SERD_ERR_BAD_SYNTAX); @@ -200,8 +200,8 @@ nonstd::expected IStreamQuadIterator::Impl::get_literal(Ser return SERD_SUCCESS; } -SerdStatus IStreamQuadIterator::Impl::on_error(void *voided_self, SerdError const *error) noexcept { - auto *self = static_cast(voided_self); +SerdStatus IStreamQuadIterator::ImplSerd::on_error(void *voided_self, SerdError const *error) noexcept { + auto *self = static_cast(voided_self); size_t buf_size; SerdStatus const st = calc_required_buffer_size(error, buf_size); @@ -233,37 +233,37 @@ SerdStatus IStreamQuadIterator::Impl::on_error(void *voided_self, SerdError cons return SERD_SUCCESS; } -SerdStatus IStreamQuadIterator::Impl::on_base(void *voided_self, const SerdNode *uri) noexcept { - auto *self = static_cast(voided_self); +SerdStatus IStreamQuadIterator::ImplSerd::on_base(void *voided_self, const SerdNode *uri) noexcept { + auto *self = static_cast(voided_self); if (self->flags.contains(ParsingFlag::NoParsePrefix)) { self->last_error = ParsingError{.error_type = ParsingError::Type::BadSyntax, - .line = serd_reader_get_current_line(self->reader), - .col = serd_reader_get_current_col(self->reader), + .line = serd_reader_get_current_line(self->reader.get()), + .col = serd_reader_get_current_col(self->reader.get()), .message = "Encountered base while parsing. hint: bases are not allowed in the current document. 
note: position may not be accurate and instead point to the end of the line."}; } else if (auto e = self->state->iri_factory.set_base(node_into_string_view(uri)); e != IRIFactoryError::Ok) { self->last_error = ParsingError{.error_type = ParsingError::Type::BadSyntax, - .line = serd_reader_get_current_line(self->reader), - .col = serd_reader_get_current_col(self->reader), + .line = serd_reader_get_current_line(self->reader.get()), + .col = serd_reader_get_current_col(self->reader.get()), .message = std::format("Error setting base: {}. note: position may not be accurate and instead point to the end of the line.", e)}; } return SERD_SUCCESS; } -SerdStatus IStreamQuadIterator::Impl::on_prefix(void *voided_self, SerdNode const *name, SerdNode const *uri) noexcept { - auto *self = static_cast(voided_self); +SerdStatus IStreamQuadIterator::ImplSerd::on_prefix(void *voided_self, SerdNode const *name, SerdNode const *uri) noexcept { + auto *self = static_cast(voided_self); if (self->flags.contains(ParsingFlag::NoParsePrefix)) { self->last_error = ParsingError{.error_type = ParsingError::Type::BadSyntax, - .line = serd_reader_get_current_line(self->reader), - .col = serd_reader_get_current_col(self->reader), + .line = serd_reader_get_current_line(self->reader.get()), + .col = serd_reader_get_current_col(self->reader.get()), .message = "Encountered prefix while parsing. hint: prefixes are not allowed in the current document. note: position may not be accurate and instead point to the end of the line."}; } else { if (self->state->iri_factory.assign_prefix(node_into_string_view(name), node_into_string_view(uri)) != IRIFactoryError::Ok) { self->last_error = ParsingError{.error_type = ParsingError::Type::BadSyntax, - .line = serd_reader_get_current_line(self->reader), - .col = serd_reader_get_current_col(self->reader), + .line = serd_reader_get_current_line(self->reader.get()), + .col = serd_reader_get_current_col(self->reader.get()), .message = std::format("Invalid prefix: {}. 
note: position may not be accurate and instead point to the end of the line.", node_into_string_view(name))}; } } @@ -271,27 +271,27 @@ SerdStatus IStreamQuadIterator::Impl::on_prefix(void *voided_self, SerdNode cons return SERD_SUCCESS; } -SerdStatus IStreamQuadIterator::Impl::inspect_node(Node const &node) noexcept { +SerdStatus IStreamQuadIterator::ImplSerd::inspect_node(Node const &node) noexcept { try { state->inspect_node_func(node); return SERD_SUCCESS; } catch (std::exception const &e) { // skip last_error = ParsingError{.error_type = ParsingError::Type::BadSyntax, - .line = serd_reader_get_current_line(reader), - .col = serd_reader_get_current_col(reader), + .line = serd_reader_get_current_line(reader.get()), + .col = serd_reader_get_current_col(reader.get()), .message = std::format("Triple explicitly skipped by inspect function: {}", e.what())}; } catch (...) { last_error = ParsingError{.error_type = ParsingError::Type::BadSyntax, - .line = serd_reader_get_current_line(reader), - .col = serd_reader_get_current_col(reader), + .line = serd_reader_get_current_line(reader.get()), + .col = serd_reader_get_current_col(reader.get()), .message = "Triple explicitly skipped by inspect function"}; } return SERD_FAILURE; } -SerdStatus IStreamQuadIterator::Impl::on_stmt(void *voided_self, +SerdStatus IStreamQuadIterator::ImplSerd::on_stmt(void *voided_self, SerdStatementFlags, SerdNode const *graph, SerdNode const *subj, @@ -300,7 +300,7 @@ SerdStatus IStreamQuadIterator::Impl::on_stmt(void *voided_self, SerdNode const *obj_datatype, SerdNode const *obj_lang) noexcept { - auto *self = static_cast(voided_self); + auto *self = static_cast(voided_self); auto const graph_node = [&]() -> nonstd::expected { if (graph != nullptr) { @@ -394,42 +394,28 @@ SerdStatus IStreamQuadIterator::Impl::on_stmt(void *voided_self, return SERD_SUCCESS; } -IStreamQuadIterator::Impl::Impl(void *stream, +IStreamQuadIterator::ImplSerd::ImplSerd(void *stream, ReadFunc read, ErrorFunc error, 
flags_type flags, state_type *initial_state) noexcept - : reader{serd_reader_new(extract_syntax_from_flags(flags), this, nullptr, &Impl::on_base, &Impl::on_prefix, &Impl::on_stmt, nullptr)}, - state{initial_state}, - state_is_owned{false}, + : reader{serd_reader_new(extract_syntax_from_flags(flags), this, nullptr, &ImplSerd::on_base, &ImplSerd::on_prefix, &ImplSerd::on_stmt, nullptr)}, + state_owned(initial_state == nullptr ? std::make_unique() : nullptr), + state{initial_state == nullptr ? state_owned.get() : initial_state}, flags{flags} { - if (this->state == nullptr) { - this->state = new state_type{}; - this->state_is_owned = true; - } - - serd_reader_set_strict(this->reader, !flags.contains(ParsingFlag::Lax)); - serd_reader_set_error_sink(this->reader, &Impl::on_error, this); - serd_reader_start_source_stream(this->reader, read, error, stream, nullptr, 4096); -} - -IStreamQuadIterator::Impl::~Impl() noexcept { - serd_reader_end_stream(this->reader); - serd_reader_free(this->reader); - - if (this->state_is_owned) { - delete this->state; - } + serd_reader_set_strict(this->reader.get(), !flags.contains(ParsingFlag::Lax)); + serd_reader_set_error_sink(this->reader.get(), &ImplSerd::on_error, this); + serd_reader_start_source_stream(this->reader.get(), read, error, stream, nullptr, 4096); } -std::optional> IStreamQuadIterator::Impl::next() { +std::optional> IStreamQuadIterator::ImplSerd::next() { while (this->quad_buffer.empty()) { if (this->last_error.has_value()) { // handle error from last time if (this->last_error_requires_skip) { this->last_error_requires_skip = false; - if (serd_reader_skip_until_byte(this->reader, '\n') != SERD_SUCCESS) { + if (serd_reader_skip_until_byte(this->reader.get(), '\n') != SERD_SUCCESS) { // EOF reached this->end_flag = true; } @@ -439,7 +425,7 @@ std::optionalreader); + SerdStatus const st = serd_reader_read_chunk(this->reader.get()); if (st == SERD_SUCCESS) { // was able to parse something @@ -455,8 +441,8 @@ std::optional not 
eof // but we don't really know what because the error handler was not called this->last_error = ParsingError{.error_type = parsing_error_type_from_serd(st), - .line = serd_reader_get_current_line(this->reader), - .col = serd_reader_get_current_col(this->reader), + .line = serd_reader_get_current_line(this->reader.get()), + .col = serd_reader_get_current_col(this->reader.get()), .message = "Unknown error"}; this->last_error_requires_skip = true; } @@ -468,12 +454,12 @@ std::optionalreader); +uint64_t IStreamQuadIterator::ImplSerd::current_line() const noexcept { + return serd_reader_get_current_line(this->reader.get()); } -uint64_t IStreamQuadIterator::Impl::current_column() const noexcept { - return serd_reader_get_current_col(this->reader); +uint64_t IStreamQuadIterator::ImplSerd::current_column() const noexcept { + return serd_reader_get_current_col(this->reader.get()); } } // namespace rdf4cpp::parser diff --git a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp index aca0acc4..fe605da0 100644 --- a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp +++ b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp @@ -14,17 +14,20 @@ namespace rdf4cpp::parser { -struct IStreamQuadIterator::Impl { +struct IStreamQuadIterator::ImplSerd final : Impl { using flags_type = IStreamQuadIterator::flags_type; using state_type = IStreamQuadIterator::state_type; using ok_type = IStreamQuadIterator::ok_type; using error_type = IStreamQuadIterator::error_type; private: - SerdReader *reader; + std::unique_ptr reader; + std::unique_ptr state_owned = nullptr; state_type *state; - bool state_is_owned; std::deque quad_buffer; std::optional last_error; @@ -33,11 +36,9 @@ struct IStreamQuadIterator::Impl { flags_type flags; -private: static std::string_view node_into_string_view(SerdNode const *node) noexcept; static ParsingError::Type parsing_error_type_from_serd(SerdStatus st) noexcept; -private: nonstd::expected 
get_bnode(std::string &&graph_str, SerdNode const *node) noexcept; nonstd::expected get_iri(SerdNode const *node) noexcept; nonstd::expected get_prefixed_iri(SerdNode const *node) noexcept; @@ -63,13 +64,13 @@ struct IStreamQuadIterator::Impl { } public: - Impl(void *stream, + ImplSerd(void *stream, ReadFunc read, ErrorFunc, flags_type flags, state_type *state) noexcept; - ~Impl() noexcept; + ~ImplSerd() override = default; /** * Tries to extract the next element from the serd backend. @@ -81,10 +82,10 @@ struct IStreamQuadIterator::Impl { * expected Quad: if there was a next element and it could be parsed * unexpected ParsingError: if there was a next element but it could not be parsed */ - [[nodiscard]] std::optional> next(); + [[nodiscard]] std::optional> next() override; - [[nodiscard]] uint64_t current_line() const noexcept; - [[nodiscard]] uint64_t current_column() const noexcept; + [[nodiscard]] uint64_t current_line() const noexcept override; + [[nodiscard]] uint64_t current_column() const noexcept override; }; } // namespace rdf4cpp::parser diff --git a/src/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp similarity index 76% rename from src/rdf4cpp/parser/XMLParser.cpp rename to private/rdf4cpp/parser/XMLParser.cpp index 823c92b2..82d41c72 100644 --- a/src/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -1,12 +1,25 @@ -#include "XMLParser.hpp" - +#include +#include #include + +#include + +#include + +#include +#include +#include +#include +#include + +#include + #include + #include -#include namespace rdf4cpp::parser { - struct XMLQuadIterator::Impl { + struct IStreamQuadIterator::ImplXML final : Impl { private: xmlSAXHandler handler_; std::unique_ptr context_; @@ -67,9 +80,9 @@ namespace rdf4cpp::parser { struct BaseState { // NOLINT(*-special-member-functions) virtual ~BaseState() = default; - virtual void on_characters(Impl *impl, std::string_view chars) = 0; - virtual void on_start_element(Impl *impl, 
std::string_view local_name, std::string_view uri, std::span attributes) = 0; - virtual void on_end_element(Impl *impl) = 0; + virtual void on_characters(ImplXML *impl, std::string_view chars) = 0; + virtual void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) = 0; + virtual void on_end_element(ImplXML *impl) = 0; struct InheritedAttributeInfo { std::string_view base = ""; @@ -83,29 +96,29 @@ namespace rdf4cpp::parser { static constexpr std::string_view base_attribute = "http://www.w3.org/XML/1998/namespacebase"; static constexpr std::string_view lang_attribute = "http://www.w3.org/XML/1998/namespacelang"; - static InheritedAttributeInfo get_inherited_attributes(Impl* impl, std::span attributes); + static InheritedAttributeInfo get_inherited_attributes(ImplXML *impl, std::span attributes); }; struct InitialState final : BaseState { - void on_characters(Impl *impl, std::string_view chars) override; - void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl) override; + void on_characters(ImplXML *impl, std::string_view chars) override; + void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML *impl) override; InitialState() : BaseState({}) {} }; struct RDFState final : BaseState { - void on_characters(Impl *impl, std::string_view chars) override; - void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl) override; + void on_characters(ImplXML *impl, std::string_view chars) override; + void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML *impl) override; static constexpr std::string_view start_element = 
"http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF"; using BaseState::BaseState; }; struct DescriptionState final : BaseState { - void on_characters(Impl *impl, std::string_view chars) override; - void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl) override; + void on_characters(ImplXML *impl, std::string_view chars) override; + void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML *impl) override; Node subject; size_t list_current = 1; @@ -115,7 +128,7 @@ namespace rdf4cpp::parser { } template - static void enter(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes, F f); + static void enter(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes, F f); static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description"; static constexpr std::string_view about_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about"; @@ -124,9 +137,9 @@ namespace rdf4cpp::parser { static constexpr std::string_view type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; }; struct PredicateState : BaseState { - void on_characters(Impl *impl, std::string_view chars) override; - void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl) override; + void on_characters(ImplXML *impl, std::string_view chars) override; + void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML *impl) override; Node subject; IRI predicate; @@ -148,8 +161,8 @@ namespace rdf4cpp::parser { static bool iri_reserved_predicate(std::string_view uri, std::string_view local_name); }; struct TypedLiteralPredicateState final : 
PredicateState { - void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl) override; + void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML *impl) override; IRI datatype; @@ -160,9 +173,9 @@ namespace rdf4cpp::parser { static constexpr std::string_view datatype_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype"; }; struct XMLLiteralState final : PredicateState { - void on_characters(Impl *impl, std::string_view chars) override; - void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl) override; + void on_characters(ImplXML *impl, std::string_view chars) override; + void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML *impl) override; size_t depth = 0; size_t data_start = 0; @@ -171,12 +184,12 @@ namespace rdf4cpp::parser { using PredicateState::PredicateState; - void source_input(Impl *impl); + void source_input(ImplXML *impl); }; struct CollectionState final : BaseState { - void on_characters(Impl *impl, std::string_view chars) override; - void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl) override; + void on_characters(ImplXML *impl, std::string_view chars) override; + void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML *impl) override; Node subject; IRI predicate; @@ -191,9 +204,9 @@ namespace rdf4cpp::parser { static constexpr std::string_view iri_first = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first"; }; struct EmptyElement final : BaseState { - void on_characters(Impl 
*impl, std::string_view chars) override; - void on_start_element(Impl *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(Impl *impl) override; + void on_characters(ImplXML *impl, std::string_view chars) override; + void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML *impl) override; EmptyElement() :BaseState({}) {} }; @@ -229,12 +242,16 @@ namespace rdf4cpp::parser { static void on_error(void *th, char const *msg, ...); public: - explicit Impl(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type* state); + ImplXML(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type* state); - std::optional next(); + [[nodiscard]] std::optional next() override; + + [[nodiscard]] uint64_t current_line() const noexcept override; + [[nodiscard]] uint64_t current_column() const noexcept override; }; - xmlSAXHandler XMLQuadIterator::Impl::make_sax_handler() { + + xmlSAXHandler IStreamQuadIterator::ImplXML::make_sax_handler() { xmlSAXHandler r{}; std::memset(&r, 0, sizeof(xmlSAXHandler)); r.initialized = XML_SAX2_MAGIC; @@ -245,18 +262,18 @@ namespace rdf4cpp::parser { return xmlGetPredefinedEntity(e); }; r.characters = [](void *th, xmlChar const *e, int const len) { - auto *t = static_cast(th); + auto *t = static_cast(th); t->current_state_->on_characters(t, from_xml_char(e, len)); }; r.startElementNs = [](void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri, [[maybe_unused]] int n_namespaces, [[maybe_unused]] xmlChar const **namespaces, int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) { - auto *t = static_cast(th); + auto *t = static_cast(th); t->current_state_->on_start_element(t, from_xml_char(local_name), from_xml_char(uri), std::span{reinterpret_cast(attributes), static_cast(n_attributes)}); }; r.endElementNs = [](void *th, 
[[maybe_unused]] xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, [[maybe_unused]] xmlChar const *uri) { - auto *t = static_cast(th); + auto *t = static_cast(th); t->current_state_->on_end_element(t); }; r.warning = on_error; @@ -264,12 +281,12 @@ namespace rdf4cpp::parser { return r; } - void XMLQuadIterator::Impl::add_error(ParsingError::Type const ty, std::string msg) { + void IStreamQuadIterator::ImplXML::add_error(ParsingError::Type const ty, std::string msg) { uint64_t const lin = xmlSAX2GetLineNumber(context_.get()); uint64_t const col = xmlSAX2GetColumnNumber(context_.get()); result_queue_.emplace_back(nonstd::unexpect, ty, lin, col, std::move(msg)); } - void XMLQuadIterator::Impl::add_statement(Node const subject, IRI const predicate, Node const object, IRI const reify) { + void IStreamQuadIterator::ImplXML::add_statement(Node const subject, IRI const predicate, Node const object, IRI const reify) { if (subject.null() || predicate.null() || object.null()) { return; } @@ -281,19 +298,19 @@ namespace rdf4cpp::parser { result_queue_.emplace_back(Quad(reify, IRI::rdf_type(state_->node_storage), IRI::make_unchecked(reify_type, state_->node_storage))); } } - void XMLQuadIterator::Impl::update_current_state() { + void IStreamQuadIterator::ImplXML::update_current_state() { if (state_stack_.empty()) { current_state_ = nullptr; return; } current_state_ = std::visit([](auto &s) -> BaseState * { return &s; }, state_stack_.back()); } - void XMLQuadIterator::Impl::pop_state() { + void IStreamQuadIterator::ImplXML::pop_state() { assert(!state_stack_.empty()); state_stack_.pop_back(); update_current_state(); } - std::string_view XMLQuadIterator::Impl::trim(std::string_view v) { + std::string_view IStreamQuadIterator::ImplXML::trim(std::string_view v) { auto s = v.find_first_not_of(" \t\r\n"); if (s == std::string_view::npos) { return ""; @@ -302,13 +319,13 @@ namespace rdf4cpp::parser { // ReSharper disable once CppDFALocalValueEscapesFunction return v; 
} - bool XMLQuadIterator::Impl::iri_equal_pieces(std::string_view const full_iri, std::string_view const uri, std::string_view const local_name) { + bool IStreamQuadIterator::ImplXML::iri_equal_pieces(std::string_view const full_iri, std::string_view const uri, std::string_view const local_name) { if (full_iri.size() != local_name.size() + uri.size()) { return false; } return full_iri.starts_with(uri) && full_iri.ends_with(local_name); } - bool XMLQuadIterator::Impl::iri_reserved(std::string_view uri, std::string_view local_name) { + bool IStreamQuadIterator::ImplXML::iri_reserved(std::string_view uri, std::string_view local_name) { static constexpr std::array reserved = { RDFState::start_element, DescriptionState::id_attrib, @@ -328,7 +345,7 @@ namespace rdf4cpp::parser { }); } template - NT XMLQuadIterator::Impl::inspect_node(NT node) { + NT IStreamQuadIterator::ImplXML::inspect_node(NT node) { try { state_->inspect_node_func(node); return node; @@ -341,7 +358,7 @@ namespace rdf4cpp::parser { } return NT::make_null(); } - IRI XMLQuadIterator::Impl::make_iri(std::string_view const iri, std::string_view const base) { + IRI IStreamQuadIterator::ImplXML::make_iri(std::string_view const iri, std::string_view const base) { if (base.empty()) { for (const auto &s : state_stack_ | std::ranges::views::reverse) { auto const v = std::visit([](const auto& s) -> std::string_view { return s.base; }, s); @@ -362,12 +379,12 @@ namespace rdf4cpp::parser { return IRI::make_null(); } } - IRI XMLQuadIterator::Impl::make_iri(std::string_view const uri, std::string_view const local_name, std::string_view const base) { + IRI IStreamQuadIterator::ImplXML::make_iri(std::string_view const uri, std::string_view const local_name, std::string_view const base) { std::string iri{uri}; iri.append(local_name); return make_iri(iri, base); } - IRI XMLQuadIterator::Impl::make_id(std::string_view const local_name, std::string_view const base) { + IRI 
IStreamQuadIterator::ImplXML::make_id(std::string_view const local_name, std::string_view const base) { std::string local = "#"; local.append(local_name); auto iri = make_iri(local, base); @@ -378,10 +395,10 @@ namespace rdf4cpp::parser { reserved_ids_.insert(iri); return iri; } - IRI XMLQuadIterator::Impl::make_hardcoded_iri(std::string_view const iri) const { + IRI IStreamQuadIterator::ImplXML::make_hardcoded_iri(std::string_view const iri) const { return IRI::make_unchecked(iri, state_->node_storage); } - Node XMLQuadIterator::Impl::make_bn(std::optional name) { + Node IStreamQuadIterator::ImplXML::make_bn(std::optional name) { std::string n = ""; if (!name.has_value()) { n = std::format("bn_{}", next_bn_index_++); @@ -405,7 +422,7 @@ namespace rdf4cpp::parser { return BlankNode::make_null(); } } - Literal XMLQuadIterator::Impl::make_literal(std::string_view value, std::optional datatype, std::optional lang_tag) { + Literal IStreamQuadIterator::ImplXML::make_literal(std::string_view value, std::optional datatype, std::optional lang_tag) { Literal l = Literal::make_null(); try { if (datatype.has_value()) { @@ -436,9 +453,9 @@ namespace rdf4cpp::parser { } return inspect_node(l); } - void XMLQuadIterator::Impl::on_error(void *th, char const *msg, ...) { // NOLINT(*-dcl50-cpp) + void IStreamQuadIterator::ImplXML::on_error(void *th, char const *msg, ...) 
{ // NOLINT(*-dcl50-cpp) va_list args; - auto t = static_cast(th); + auto t = static_cast(th); va_start(args, msg); // NOLINT(*-pro-bounds-array-to-pointer-decay) std::string out{}; out.resize(1024, '\0'); @@ -453,7 +470,7 @@ namespace rdf4cpp::parser { va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay) } - XMLQuadIterator::Impl::BaseState::InheritedAttributeInfo XMLQuadIterator::Impl::BaseState::get_inherited_attributes(Impl *impl, std::span const attributes) { + IStreamQuadIterator::ImplXML::BaseState::InheritedAttributeInfo IStreamQuadIterator::ImplXML::BaseState::get_inherited_attributes(ImplXML *impl, std::span const attributes) { InheritedAttributeInfo r{}; for (const auto& a : attributes) { if (iri_equal_pieces(base_attribute, a.uri(), a.local_name())) @@ -470,12 +487,12 @@ namespace rdf4cpp::parser { } return r; } - void XMLQuadIterator::Impl::InitialState::on_characters(Impl *impl, std::string_view const chars) { + void IStreamQuadIterator::ImplXML::InitialState::on_characters(ImplXML *impl, std::string_view const chars) { if (!trim(chars).empty()) { impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found characters"); } } - void XMLQuadIterator::Impl::InitialState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes) { + void IStreamQuadIterator::ImplXML::InitialState::on_start_element(ImplXML *impl, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes) { if (iri_equal_pieces(RDFState::start_element, uri, local_name)) { impl->state_stack_.emplace_back(std::in_place_type_t{}, get_inherited_attributes(impl, attributes)); impl->update_current_state(); @@ -483,28 +500,28 @@ namespace rdf4cpp::parser { } impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found ???"); } - void XMLQuadIterator::Impl::InitialState::on_end_element(Impl *impl) { + void 
IStreamQuadIterator::ImplXML::InitialState::on_end_element(ImplXML *impl) { impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found end of ???"); } - void XMLQuadIterator::Impl::RDFState::on_characters(Impl *impl, std::string_view const chars) { + void IStreamQuadIterator::ImplXML::RDFState::on_characters(ImplXML *impl, std::string_view const chars) { if (!trim(chars).empty()) { impl->add_error(ParsingError::Type::BadSyntax, "expected Description, found characters"); } } - void XMLQuadIterator::Impl::RDFState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { + void IStreamQuadIterator::ImplXML::RDFState::on_start_element(ImplXML *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { DescriptionState::enter(impl, local_name, uri, attributes, [](auto) {}); } - void XMLQuadIterator::Impl::RDFState::on_end_element(Impl *impl) { + void IStreamQuadIterator::ImplXML::RDFState::on_end_element(ImplXML *impl) { impl->pop_state(); } - void XMLQuadIterator::Impl::DescriptionState::on_characters(Impl *impl, std::string_view const chars) { + void IStreamQuadIterator::ImplXML::DescriptionState::on_characters(ImplXML *impl, std::string_view const chars) { if (!trim(chars).empty()) { impl->add_error(ParsingError::Type::BadSyntax, "expected predicate, found characters"); } } - void XMLQuadIterator::Impl::DescriptionState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span attributes) { + void IStreamQuadIterator::ImplXML::DescriptionState::on_start_element(ImplXML *impl, std::string_view const local_name, std::string_view const uri, std::span attributes) { auto const i = get_inherited_attributes(impl, attributes); IRI predicate; if (iri_equal_pieces(PredicateState::list_start_element, uri, local_name)) { @@ -576,11 +593,11 @@ namespace rdf4cpp::parser { } impl->update_current_state(); } - void 
XMLQuadIterator::Impl::DescriptionState::on_end_element(Impl *impl) { + void IStreamQuadIterator::ImplXML::DescriptionState::on_end_element(ImplXML *impl) { impl->pop_state(); } template - void XMLQuadIterator::Impl::DescriptionState::enter(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { + void IStreamQuadIterator::ImplXML::DescriptionState::enter(ImplXML *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { auto const i = get_inherited_attributes(impl, attributes); Node sub = Node::make_null(); auto check_only_one = [&sub, impl]() { @@ -637,7 +654,7 @@ namespace rdf4cpp::parser { impl->update_current_state(); } - void XMLQuadIterator::Impl::PredicateState::on_characters([[maybe_unused]] Impl *impl, std::string_view const chars) { + void IStreamQuadIterator::ImplXML::PredicateState::on_characters([[maybe_unused]] ImplXML *impl, std::string_view const chars) { if (done) { if (!trim(chars).empty()) { impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found literal"); @@ -646,7 +663,7 @@ namespace rdf4cpp::parser { } literal.append(chars); } - void XMLQuadIterator::Impl::PredicateState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { + void IStreamQuadIterator::ImplXML::PredicateState::on_start_element(ImplXML *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { if (!trim(literal).empty()) { impl->add_error(ParsingError::Type::BadSyntax, "expected end of element or literal, found element"); return; @@ -660,7 +677,7 @@ namespace rdf4cpp::parser { impl->add_statement(subject, predicate, obj, reify); }); } - void XMLQuadIterator::Impl::PredicateState::on_end_element(Impl *impl) { + void IStreamQuadIterator::ImplXML::PredicateState::on_end_element(ImplXML *impl) { if (!done) { Literal const lit = 
impl->make_literal(literal, std::nullopt, std::nullopt); impl->add_statement(subject, predicate, lit, reify); @@ -668,27 +685,27 @@ namespace rdf4cpp::parser { impl->pop_state(); } - bool XMLQuadIterator::Impl::PredicateState::iri_reserved_predicate(std::string_view const uri, std::string_view const local_name) { + bool IStreamQuadIterator::ImplXML::PredicateState::iri_reserved_predicate(std::string_view const uri, std::string_view const local_name) { return iri_reserved(uri, local_name) || iri_equal_pieces(DescriptionState::start_element, uri, local_name) || iri_equal_pieces(list_start_element, uri, local_name); } - void XMLQuadIterator::Impl::TypedLiteralPredicateState::on_start_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::on_start_element(ImplXML *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { impl->add_error(ParsingError::Type::BadSyntax, "expected literal, found element"); } - void XMLQuadIterator::Impl::TypedLiteralPredicateState::on_end_element(Impl *impl) { + void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::on_end_element(ImplXML *impl) { if (!datatype.null()) { Literal const lit = impl->make_literal(literal, datatype, std::nullopt); impl->add_statement(subject, predicate, lit, reify); } impl->pop_state(); } - void XMLQuadIterator::Impl::XMLLiteralState::on_characters(Impl *impl, [[maybe_unused]] std::string_view chars) { + void IStreamQuadIterator::ImplXML::XMLLiteralState::on_characters(ImplXML *impl, [[maybe_unused]] std::string_view chars) { source_input(impl); } - void XMLQuadIterator::Impl::XMLLiteralState::on_start_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + void 
IStreamQuadIterator::ImplXML::XMLLiteralState::on_start_element(ImplXML *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { ++depth; source_input(impl); } - void XMLQuadIterator::Impl::XMLLiteralState::on_end_element(Impl *impl) { + void IStreamQuadIterator::ImplXML::XMLLiteralState::on_end_element(ImplXML *impl) { if (depth > 0) { --depth; source_input(impl); @@ -710,7 +727,7 @@ namespace rdf4cpp::parser { impl->add_statement(subject, predicate, lit, reify); impl->pop_state(); } - void XMLQuadIterator::Impl::XMLLiteralState::source_input(Impl *impl) { + void IStreamQuadIterator::ImplXML::XMLLiteralState::source_input(ImplXML *impl) { const xmlChar* data; int size = 1024; int off = 0; @@ -725,12 +742,12 @@ namespace rdf4cpp::parser { } last_offset = static_cast(off) + last_size; } - void XMLQuadIterator::Impl::CollectionState::on_characters(Impl *impl, std::string_view const chars) { + void IStreamQuadIterator::ImplXML::CollectionState::on_characters(ImplXML *impl, std::string_view const chars) { if (!trim(chars).empty()) { impl->add_error(ParsingError::Type::BadSyntax, "expected element, found characters"); } } - void XMLQuadIterator::Impl::CollectionState::on_start_element(Impl *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { + void IStreamQuadIterator::ImplXML::CollectionState::on_start_element(ImplXML *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { DescriptionState::enter(impl, local_name, uri, attributes, [&](Node const obj) { if (first) { first = false; @@ -744,7 +761,7 @@ namespace rdf4cpp::parser { impl->add_statement(last_bn, impl->make_hardcoded_iri(iri_first), obj, IRI::make_null()); }); } - void XMLQuadIterator::Impl::CollectionState::on_end_element(Impl *impl) { + void IStreamQuadIterator::ImplXML::CollectionState::on_end_element(ImplXML *impl) { auto const nil = 
impl->make_hardcoded_iri(iri_nil); if (first) { impl->add_statement(subject, predicate, nil, reify); @@ -753,19 +770,19 @@ namespace rdf4cpp::parser { } impl->pop_state(); } - void XMLQuadIterator::Impl::EmptyElement::on_characters(Impl *impl, std::string_view const chars) { + void IStreamQuadIterator::ImplXML::EmptyElement::on_characters(ImplXML *impl, std::string_view const chars) { if (!trim(chars).empty()) { impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found characters"); } } - void XMLQuadIterator::Impl::EmptyElement::on_start_element(Impl *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + void IStreamQuadIterator::ImplXML::EmptyElement::on_start_element(ImplXML *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found ???"); } - void XMLQuadIterator::Impl::EmptyElement::on_end_element(Impl *impl) { + void IStreamQuadIterator::ImplXML::EmptyElement::on_end_element(ImplXML *impl) { impl->pop_state(); } - XMLQuadIterator::Impl::Impl(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type* state) + IStreamQuadIterator::ImplXML::ImplXML(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type* state) : handler_(make_sax_handler()), context_(xmlCreatePushParserCtxt(&handler_, this, nullptr, 0, "mem")), reader_obj_(obj), read_func_(read), error_func_(err), eof_func_(eof), @@ -776,7 +793,7 @@ namespace rdf4cpp::parser { current_state_->base = IRIFactory::default_base; } - std::optional XMLQuadIterator::Impl::next() { + std::optional IStreamQuadIterator::ImplXML::next() { std::array buffer; // NOLINT(*-pro-type-member-init) while (result_queue_.empty() && error_func_(reader_obj_) == 0 && eof_func_(reader_obj_) == 0) { auto const read = read_func_(buffer.data(), sizeof(char), buffer.size(), 
reader_obj_); @@ -789,43 +806,15 @@ namespace rdf4cpp::parser { result_queue_.pop_front(); return r; } - - - XMLQuadIterator::XMLQuadIterator(void *stream, ReadFunc read, ErrorFunc error, EOFFunc eof, state_type* state) - : impl_(std::make_unique(stream, read, error, eof, state)), cur_(impl_->next()) { + uint64_t IStreamQuadIterator::ImplXML::current_line() const noexcept { + return xmlSAX2GetLineNumber(context_.get()); } - XMLQuadIterator::XMLQuadIterator(std::istream &stream, state_type* state) - : XMLQuadIterator(&stream, - [](void *buf, [[maybe_unused]] size_t elem_size, size_t count, void *voided_self) noexcept -> size_t { - RDF4CPP_ASSERT(elem_size == 1); - - auto *self = static_cast(voided_self); - self->read(static_cast(buf), static_cast(count)); - return self->gcount(); - }, - [](void *voided_self) noexcept { - auto *self = static_cast(voided_self); - return static_cast(self->fail() && !self->eof()); - }, - [](void *voided_self) noexcept { - auto *self = static_cast(voided_self); - return static_cast(self->eof()); - }, state) - { + uint64_t IStreamQuadIterator::ImplXML::current_column() const noexcept { + return xmlSAX2GetColumnNumber(context_.get()); } - XMLQuadIterator::~XMLQuadIterator() noexcept = default; - XMLQuadIterator::reference XMLQuadIterator::operator*() const noexcept { - return *cur_; - } - XMLQuadIterator::pointer XMLQuadIterator::operator->() const noexcept { - return &*cur_; - } - XMLQuadIterator &XMLQuadIterator::operator++() { - cur_ = impl_->next(); - return *this; - } - bool XMLQuadIterator::operator==(std::default_sentinel_t) const noexcept { - return !cur_.has_value(); + + std::unique_ptr IStreamQuadIterator::make_xml_impl(void *obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type *state) { + return std::make_unique(obj, read, err, eof, state); } } // namespace rdf4cpp::parser diff --git a/src/rdf4cpp/parser/IStreamQuadIterator.cpp b/src/rdf4cpp/parser/IStreamQuadIterator.cpp index 677a8522..5c3e9aae 100644 --- 
a/src/rdf4cpp/parser/IStreamQuadIterator.cpp +++ b/src/rdf4cpp/parser/IStreamQuadIterator.cpp @@ -1,4 +1,5 @@ #include "IStreamQuadIterator.hpp" + #include #include @@ -38,19 +39,33 @@ static int istream_error(void *voided_self) noexcept { return static_cast(self->fail() && !self->eof()); } +/** + * Adaptor function for serd to check if an std::istream is at the end of file + * + * @param voided_self pointer to std::istream cast to void * + * @return whether the given istream encountered an error (cast to int) + */ +static int istream_eof(void *voided_self) noexcept { + auto *self = static_cast(voided_self); + return static_cast(self->eof()); +} + IStreamQuadIterator::IStreamQuadIterator(void *stream, ReadFunc read, ErrorFunc error, + EOFFunc eof, flags_type flags, state_type *state) - : impl{std::make_unique(stream, read, error, flags, state)}, + : impl{flags.get_syntax() == ParsingFlag::RdfXml ? + make_xml_impl(stream, read, error, eof, state) : + std::make_unique(stream, read, error, flags, state)}, cur{impl->next()} { } IStreamQuadIterator::IStreamQuadIterator(std::istream &istream, flags_type flags, state_type *state) - : IStreamQuadIterator{&istream, &istream_read, &istream_error, flags, state} { + : IStreamQuadIterator{&istream, &istream_read, &istream_error, &istream_eof, flags, state} { } IStreamQuadIterator::IStreamQuadIterator(IStreamQuadIterator &&other) noexcept = default; diff --git a/src/rdf4cpp/parser/IStreamQuadIterator.hpp b/src/rdf4cpp/parser/IStreamQuadIterator.hpp index 6cc27239..d8b00502 100644 --- a/src/rdf4cpp/parser/IStreamQuadIterator.hpp +++ b/src/rdf4cpp/parser/IStreamQuadIterator.hpp @@ -35,6 +35,15 @@ using ReadFunc = size_t (*)(void *buffer, size_t elem_size, size_t count, void * */ using ErrorFunc = int (*)(void *stream); +/** + * Identical semantics to feof. 
+ * + * + * @param stream pointer to any object + * @return nonzero value if there is an error in stream, zero value otherwise + */ +using EOFFunc = int (*)(void *stream); + /** * Similar to std::istream_iterator<>. * Parses the given istream and tries to extract Quads given in TURTLE format. @@ -70,11 +79,35 @@ struct IStreamQuadIterator { using istream_type = std::istream; private: - struct Impl; + struct Impl { + virtual ~Impl() = default; + /** + * Tries to extract the next element from the backend. + * Will try to skip over errors so that the next call might be able to return a value. + * + * @note Call until std::nullopt is returned + * @return + * std::nullopt: if there is no next element (eof) + * expected Quad: if there was a next element and it could be parsed + * unexpected ParsingError: if there was a next element but it could not be parsed + */ + [[nodiscard]] virtual std::optional> next() = 0; + [[nodiscard]] virtual uint64_t current_line() const noexcept = 0; + [[nodiscard]] virtual uint64_t current_column() const noexcept = 0; + + Impl() = default; + Impl(Impl const &) = delete; + Impl(Impl&&) = delete; + Impl &operator=(Impl const &) = delete; + Impl &operator=(Impl &&) = delete; + }; + struct ImplSerd; + struct ImplXML; std::unique_ptr impl; std::optional> cur; + static std::unique_ptr make_xml_impl(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type* state); public: /** * Constructs a IStreamQuadIterator from a C-like io api. 
That is something similar to @@ -91,6 +124,7 @@ struct IStreamQuadIterator { IStreamQuadIterator(void *stream, ReadFunc read, ErrorFunc error, + EOFFunc eof, flags_type flags = ParsingFlags::none(), state_type *initial_state = nullptr); diff --git a/src/rdf4cpp/parser/ParsingFlags.hpp b/src/rdf4cpp/parser/ParsingFlags.hpp index 7f8faaa7..d49fec55 100644 --- a/src/rdf4cpp/parser/ParsingFlags.hpp +++ b/src/rdf4cpp/parser/ParsingFlags.hpp @@ -21,6 +21,9 @@ enum struct ParsingFlag : uint8_t { NTriples = 0b01 << 4, NQuads = 0b10 << 4, TriG = 0b11 << 4, + RdfXml = 0b100 << 4, + + SyntaxMask = 0b111 << 4, }; struct ParsingFlags { @@ -67,7 +70,7 @@ struct ParsingFlags { * @return the syntax ParsingFlag contained in this ParsingFlags. (Turtle if not specified) */ [[nodiscard]] constexpr ParsingFlag get_syntax() const noexcept { - return static_cast(flags & static_cast(ParsingFlag::TriG)); // TriG is 11, so it can double as a mask + return static_cast(flags & static_cast(ParsingFlag::SyntaxMask)); } [[nodiscard]] constexpr bool syntax_allows_prefixes() const noexcept { diff --git a/src/rdf4cpp/parser/RDFFileParser.cpp b/src/rdf4cpp/parser/RDFFileParser.cpp index 9c70d66c..055a2db0 100644 --- a/src/rdf4cpp/parser/RDFFileParser.cpp +++ b/src/rdf4cpp/parser/RDFFileParser.cpp @@ -30,7 +30,7 @@ RDFFileParser::iterator::iterator(FILE *&&stream, state_type *state) : stream_(stream), iter_(std::make_unique(stream_, reinterpret_cast(&fread), reinterpret_cast(&ferror), - flags, state)) { + reinterpret_cast(feof), flags, state)) { } RDFFileParser::iterator::~iterator() noexcept { fclose(stream_); diff --git a/src/rdf4cpp/parser/XMLParser.hpp b/src/rdf4cpp/parser/XMLParser.hpp deleted file mode 100644 index cfc0c341..00000000 --- a/src/rdf4cpp/parser/XMLParser.hpp +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef RDF4CPP_XMLPARSER_HPP -#define RDF4CPP_XMLPARSER_HPP - -#include -#include - -#include - -#include - -#include -#include -#include -#include - -namespace rdf4cpp::parser { - struct 
XMLQuadIterator { - using flags_type = ParsingFlags; - using state_type = ParsingState; - using ok_type = Quad; - using error_type = ParsingError; - - using value_type = nonstd::expected; - using reference = value_type const &; - using pointer = value_type const *; - using difference_type = std::ptrdiff_t; - using iterator_category = std::input_iterator_tag; - using istream_type = std::istream; - - /** - * Identical semantics to fread. - * Uses stream to read at most count elements of size element_size into buffer. - * - * @param buffer pointer to buffer with at least count elements of size elem_size - * @param elem_size sizeof each element - * @param count number of elements to read - * @param stream pointer to any object. - * @return number of elements read - */ - using ReadFunc = size_t (*)(void *buffer, size_t elem_size, size_t count, void *stream); - - /** - * Identical semantics to ferror. - * - * @param stream pointer to any object - * @return nonzero value if there is an error in stream, zero value otherwise - */ - using ErrorFunc = int (*)(void *stream); - - /** - * Identical semantics to feof. 
- * - * - * @param stream pointer to any object - * @return nonzero value if there is an error in stream, zero value otherwise - */ - using EOFFunc = int (*)(void *stream); - - private: - struct Impl; - - std::unique_ptr impl_; - std::optional> cur_; - - public: - XMLQuadIterator(void *stream, ReadFunc read, ErrorFunc error, EOFFunc eof, state_type* state = nullptr); - explicit XMLQuadIterator(std::istream& stream, state_type* state = nullptr); - - XMLQuadIterator(XMLQuadIterator&&) noexcept = delete; - XMLQuadIterator& operator=(XMLQuadIterator&&) noexcept = delete; - - XMLQuadIterator(XMLQuadIterator const &) = delete; - XMLQuadIterator& operator=(XMLQuadIterator const &) = delete; - - ~XMLQuadIterator() noexcept; - - reference operator*() const noexcept; - pointer operator->() const noexcept; - XMLQuadIterator &operator++(); - - bool operator==(std::default_sentinel_t) const noexcept; - }; -} - -#endif //RDF4CPP_XMLPARSER_H diff --git a/tests/bench_SerDe.cpp b/tests/bench_SerDe.cpp index d32d8bbe..64e6f732 100644 --- a/tests/bench_SerDe.cpp +++ b/tests/bench_SerDe.cpp @@ -27,6 +27,7 @@ void deserialize(std::filesystem::path const &in_path, Dataset &ds, storage::Dyn parser::IStreamQuadIterator qit{in_file, reinterpret_cast(&fread), reinterpret_cast(&ferror), + reinterpret_cast(&feof), parser::ParsingFlags::none(), &state}; diff --git a/tests/parser/tests_IStreamQuadIterator.cpp b/tests/parser/tests_IStreamQuadIterator.cpp index 7c705cd4..8df3754f 100644 --- a/tests/parser/tests_IStreamQuadIterator.cpp +++ b/tests/parser/tests_IStreamQuadIterator.cpp @@ -404,7 +404,7 @@ TEST_SUITE("IStreamQuadIterator") { } auto *f = fopen(path, "r"); - for (IStreamQuadIterator qit{f, reinterpret_cast(fread), reinterpret_cast(ferror)}; qit != std::default_sentinel; ++qit) { + for (IStreamQuadIterator qit{f, reinterpret_cast(fread), reinterpret_cast(ferror), reinterpret_cast(feof)}; qit != std::default_sentinel; ++qit) { FAIL("not empty"); } diff --git 
a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index b072d278..9f2adb00 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -1,11 +1,7 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN -#include "rdf4cpp/parser/XMLParser.hpp" - - #include #include -#include #include @@ -41,7 +37,7 @@ TEST_CASE("sanity test") { )"}; - XMLQuadIterator it{str}; + IStreamQuadIterator it{str, ParsingFlag::RdfXml}; CHECK(it != std::default_sentinel); CHECK(it->has_value()); CHECK(it->value().subject() == IRI::make("https://www.example.com")); @@ -772,7 +768,7 @@ _:d1 "2" .)"; } std::stringstream xml_str{xml}; - XMLQuadIterator xml_iter{xml_str}; + IStreamQuadIterator xml_iter{xml_str, ParsingFlag::RdfXml}; std::stringstream nt_str{nt}; IStreamQuadIterator nt_iter{nt_str, ParsingFlag::NTriples}; @@ -860,7 +856,7 @@ TEST_CASE("rdf xml negative tests") { } std::stringstream xml_str{xml}; - XMLQuadIterator xml_iter{xml_str}; + IStreamQuadIterator xml_iter{xml_str, ParsingFlag::RdfXml}; while (xml_iter != std::default_sentinel) { if (!ignore_some_triples) { From 0098e624541e979f124101c9c80ac5059f19c2d0 Mon Sep 17 00:00:00 2001 From: mcb Date: Fri, 21 Nov 2025 14:50:15 +0100 Subject: [PATCH 21/42] try fix gcc error --- private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp | 2 ++ private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp index b687317b..78714b9a 100644 --- a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp +++ b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp @@ -408,6 +408,8 @@ IStreamQuadIterator::ImplSerd::ImplSerd(void *stream, serd_reader_set_error_sink(this->reader.get(), &ImplSerd::on_error, this); serd_reader_start_source_stream(this->reader.get(), read, error, stream, nullptr, 4096); } +IStreamQuadIterator::ImplSerd::~ImplSerd() 
{ +} std::optional> IStreamQuadIterator::ImplSerd::next() { while (this->quad_buffer.empty()) { diff --git a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp index fe605da0..8a6b2e55 100644 --- a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp +++ b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp @@ -70,7 +70,7 @@ struct IStreamQuadIterator::ImplSerd final : Impl { flags_type flags, state_type *state) noexcept; - ~ImplSerd() override = default; + ~ImplSerd() override; /** * Tries to extract the next element from the serd backend. From 9251a7f5ba4f4345a255c10f40236e0b46f9f8c7 Mon Sep 17 00:00:00 2001 From: mcb Date: Fri, 21 Nov 2025 15:15:32 +0100 Subject: [PATCH 22/42] another gcc fix --- .../rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp index 8a6b2e55..5ed01348 100644 --- a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp +++ b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp @@ -21,10 +21,15 @@ struct IStreamQuadIterator::ImplSerd final : Impl { using error_type = IStreamQuadIterator::error_type; private: - std::unique_ptr reader; + // workaround for gcc-14 bug, erroneously warns on unsing a lambda here + // see https://github.com/NVIDIA/stdexec/issues/1143 + struct SerdReaderDtorLambda { + void operator()(SerdReader* r) const { + serd_reader_end_stream(r); + serd_reader_free(r); + } + }; + std::unique_ptr reader; std::unique_ptr state_owned = nullptr; state_type *state; From 48adc713c416ed643f9ff41d6d780f0b379ac560 Mon Sep 17 00:00:00 2001 From: mcb Date: Wed, 26 Nov 2025 15:55:14 +0100 Subject: [PATCH 23/42] cleanup --- conanfile.py | 1 - private/rdf4cpp/parser/XMLParser.cpp | 562 +++++++++++++++------------ tests/parser/tests_XMLParser.cpp | 26 +- 3 files changed, 331 insertions(+), 
258 deletions(-) diff --git a/conanfile.py b/conanfile.py index dfe224a2..6fffee96 100644 --- a/conanfile.py +++ b/conanfile.py @@ -40,7 +40,6 @@ def requirements(self): self.requires("dice-sparse-map/0.2.9", transitive_headers=True) self.requires("dice-template-library/1.13.0", transitive_headers=True) self.requires("libxml2/2.15.0") - self.requires("zlib/1.3.1", force=True) if self.options.with_test_deps: self.test_requires("doctest/2.4.11") diff --git a/private/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp index 82d41c72..7815ed86 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -1,16 +1,15 @@ +#include #include #include -#include #include #include -#include -#include -#include #include #include +#include +#include #include @@ -22,8 +21,11 @@ namespace rdf4cpp::parser { struct IStreamQuadIterator::ImplXML final : Impl { private: xmlSAXHandler handler_; - std::unique_ptr context_; - void* reader_obj_; + std::unique_ptr + context_; + void *reader_obj_; ReadFunc read_func_; ErrorFunc error_func_; EOFFunc eof_func_; @@ -38,21 +40,23 @@ namespace rdf4cpp::parser { static constexpr std::string_view reify_object = "http://www.w3.org/1999/02/22-rdf-syntax-ns#object"; static constexpr std::string_view reify_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement"; - static std::string_view from_xml_char(xmlChar const * s) { + static std::string_view from_xml_char(xmlChar const *s) { if (s == nullptr) { return ""; } // ReSharper disable once CppDFALocalValueEscapesFunction return {reinterpret_cast(s)}; } - static std::string_view from_xml_char(xmlChar const * s, xmlChar const * e) { + + static std::string_view from_xml_char(xmlChar const *s, xmlChar const *e) { if (s == nullptr) { return ""; } // ReSharper disable once CppDFALocalValueEscapesFunction return {reinterpret_cast(s), reinterpret_cast(e)}; } - static std::string_view from_xml_char(xmlChar const * s, int const n) { + + static 
std::string_view from_xml_char(xmlChar const *s, int const n) { if (s == nullptr) { return ""; } @@ -70,9 +74,11 @@ namespace rdf4cpp::parser { [[nodiscard]] std::string_view value() const { return from_xml_char(value_start_raw, value_end_raw); } + [[nodiscard]] std::string_view local_name() const { return from_xml_char(local_name_raw); } + [[nodiscard]] std::string_view uri() const { return from_xml_char(uri_raw); } @@ -80,9 +86,9 @@ namespace rdf4cpp::parser { struct BaseState { // NOLINT(*-special-member-functions) virtual ~BaseState() = default; - virtual void on_characters(ImplXML *impl, std::string_view chars) = 0; - virtual void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) = 0; - virtual void on_end_element(ImplXML *impl) = 0; + virtual void on_characters(ImplXML &impl, std::string_view chars) = 0; + virtual void on_start_element(ImplXML &impl, std::string_view local_name, std::string_view uri, std::span attributes) = 0; + virtual void on_end_element(ImplXML &impl) = 0; struct InheritedAttributeInfo { std::string_view base = ""; @@ -92,33 +98,39 @@ namespace rdf4cpp::parser { std::string base; std::string lang_tag; - explicit BaseState(InheritedAttributeInfo const &i) : base(i.base), lang_tag(i.lang_tag) {} + explicit BaseState(InheritedAttributeInfo const &i) + : base(i.base), lang_tag(i.lang_tag) { + } static constexpr std::string_view base_attribute = "http://www.w3.org/XML/1998/namespacebase"; static constexpr std::string_view lang_attribute = "http://www.w3.org/XML/1998/namespacelang"; - static InheritedAttributeInfo get_inherited_attributes(ImplXML *impl, std::span attributes); + static InheritedAttributeInfo get_inherited_attributes(ImplXML &impl, std::span attributes); }; struct InitialState final : BaseState { - void on_characters(ImplXML *impl, std::string_view chars) override; - void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) 
override; - void on_end_element(ImplXML *impl) override; + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; - InitialState() : BaseState({}) {} + InitialState() + : BaseState({}) { + } }; + struct RDFState final : BaseState { - void on_characters(ImplXML *impl, std::string_view chars) override; - void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML *impl) override; + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF"; using BaseState::BaseState; }; + struct DescriptionState final : BaseState { - void on_characters(ImplXML *impl, std::string_view chars) override; - void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML *impl) override; + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; Node subject; size_t list_current = 1; @@ -128,7 +140,7 @@ namespace rdf4cpp::parser { } template - static void enter(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes, F f); + static void enter(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes, F f); static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description"; static constexpr std::string_view about_attrib = 
"http://www.w3.org/1999/02/22-rdf-syntax-ns#about"; @@ -136,10 +148,11 @@ namespace rdf4cpp::parser { static constexpr std::string_view node_id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID"; static constexpr std::string_view type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; }; + struct PredicateState : BaseState { - void on_characters(ImplXML *impl, std::string_view chars) override; - void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML *impl) override; + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; Node subject; IRI predicate; @@ -160,9 +173,10 @@ namespace rdf4cpp::parser { static bool iri_reserved_predicate(std::string_view uri, std::string_view local_name); }; + struct TypedLiteralPredicateState final : PredicateState { - void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML *impl) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; IRI datatype; @@ -172,10 +186,11 @@ namespace rdf4cpp::parser { static constexpr std::string_view datatype_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype"; }; + struct XMLLiteralState final : PredicateState { - void on_characters(ImplXML *impl, std::string_view chars) override; - void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML *impl) override; + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, 
std::span attributes) override; + void on_end_element(ImplXML &i) override; size_t depth = 0; size_t data_start = 0; @@ -184,12 +199,13 @@ namespace rdf4cpp::parser { using PredicateState::PredicateState; - void source_input(ImplXML *impl); + void source_input(ImplXML &i); }; + struct CollectionState final : BaseState { - void on_characters(ImplXML *impl, std::string_view chars) override; - void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML *impl) override; + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; Node subject; IRI predicate; @@ -197,18 +213,23 @@ namespace rdf4cpp::parser { IRI reify; bool first = true; - CollectionState(InheritedAttributeInfo const &i, Node sub, IRI pred, IRI reify) : BaseState(i), subject(sub), predicate(pred), reify(reify) {} + CollectionState(InheritedAttributeInfo const &i, Node sub, IRI pred, IRI reify) + : BaseState(i), subject(sub), predicate(pred), reify(reify) { + } static constexpr std::string_view iri_nil = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil"; static constexpr std::string_view iri_rest = "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest"; static constexpr std::string_view iri_first = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first"; }; + struct EmptyElement final : BaseState { - void on_characters(ImplXML *impl, std::string_view chars) override; - void on_start_element(ImplXML *impl, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML *impl) override; + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; - 
EmptyElement() :BaseState({}) {} + EmptyElement() + : BaseState({}) { + } }; BaseState *current_state_ = nullptr; @@ -220,29 +241,43 @@ namespace rdf4cpp::parser { /** * add statement to the output list, if none of the components is null * (null is used to track an already inserted parse error for that component) - * @param subject - * @param predicate - * @param object */ void add_statement(Node subject, IRI predicate, Node object, IRI reify); void update_current_state(); void pop_state(); - static std::string_view trim(std::string_view v); - static bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); - static bool iri_reserved(std::string_view uri, std::string_view local_name); + /** + * removes whitespace according to xml spec + */ + [[nodiscard]] static std::string_view trim_left(std::string_view v); + [[nodiscard]] static bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); + [[nodiscard]] static bool iri_reserved(std::string_view uri, std::string_view local_name); template - NT inspect_node(NT node); - IRI make_iri(std::string_view iri, std::string_view base); - IRI make_iri(std::string_view uri, std::string_view local_name, std::string_view base); - IRI make_id(std::string_view local_name, std::string_view base); + [[nodiscard]] NT inspect_node(NT node); + [[nodiscard]] IRI make_iri(std::string_view iri, std::string_view base); + [[nodiscard]] IRI make_iri(std::string_view uri, std::string_view local_name, std::string_view base); + /** + * create the IRI for an id_attrib, including uniqueness check + */ + [[nodiscard]] IRI make_id(std::string_view local_name, std::string_view base); + /** + * create an IRI with no checks, intended for hardcoded IRIs like reify_subject + */ [[nodiscard]] IRI make_hardcoded_iri(std::string_view iri) const; - Node make_bn(std::optional name); - Literal make_literal(std::string_view value, std::optional datatype, std::optional 
lang_tag); + [[nodiscard]] IRI make_type_iri() const; + [[nodiscard]] Node make_bn(std::optional name); + /** + * creates a literal + * @param value + * @param datatype + * @param lang_tag (ignored, if datatype is set) + * @return + */ + [[nodiscard]] Literal make_literal(std::string_view value, std::optional datatype, std::optional lang_tag); static void on_error(void *th, char const *msg, ...); public: - ImplXML(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type* state); + ImplXML(void *obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type *state); [[nodiscard]] std::optional next() override; @@ -263,18 +298,18 @@ namespace rdf4cpp::parser { }; r.characters = [](void *th, xmlChar const *e, int const len) { auto *t = static_cast(th); - t->current_state_->on_characters(t, from_xml_char(e, len)); + t->current_state_->on_characters(*t, from_xml_char(e, len)); }; r.startElementNs = [](void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri, [[maybe_unused]] int n_namespaces, [[maybe_unused]] xmlChar const **namespaces, int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) { auto *t = static_cast(th); - t->current_state_->on_start_element(t, from_xml_char(local_name), from_xml_char(uri), + t->current_state_->on_start_element(*t, from_xml_char(local_name), from_xml_char(uri), std::span{reinterpret_cast(attributes), static_cast(n_attributes)}); }; r.endElementNs = [](void *th, [[maybe_unused]] xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, [[maybe_unused]] xmlChar const *uri) { auto *t = static_cast(th); - t->current_state_->on_end_element(t); + t->current_state_->on_end_element(*t); }; r.warning = on_error; r.error = on_error; @@ -286,31 +321,38 @@ namespace rdf4cpp::parser { uint64_t const col = xmlSAX2GetColumnNumber(context_.get()); result_queue_.emplace_back(nonstd::unexpect, ty, lin, col, std::move(msg)); } + void 
IStreamQuadIterator::ImplXML::add_statement(Node const subject, IRI const predicate, Node const object, IRI const reify) { if (subject.null() || predicate.null() || object.null()) { return; } result_queue_.emplace_back(Quad(subject, predicate, object)); if (!reify.null()) { - result_queue_.emplace_back(Quad(reify, IRI::make_unchecked(reify_subject, state_->node_storage), subject)); - result_queue_.emplace_back(Quad(reify, IRI::make_unchecked(reify_predicate, state_->node_storage), predicate)); - result_queue_.emplace_back(Quad(reify, IRI::make_unchecked(reify_object, state_->node_storage), object)); - result_queue_.emplace_back(Quad(reify, IRI::rdf_type(state_->node_storage), IRI::make_unchecked(reify_type, state_->node_storage))); + result_queue_.emplace_back(Quad(reify, make_hardcoded_iri(reify_subject), subject)); + result_queue_.emplace_back(Quad(reify, make_hardcoded_iri(reify_predicate), predicate)); + result_queue_.emplace_back(Quad(reify, make_hardcoded_iri(reify_object), object)); + result_queue_.emplace_back(Quad(reify, make_type_iri(), make_hardcoded_iri(reify_type))); } } + void IStreamQuadIterator::ImplXML::update_current_state() { if (state_stack_.empty()) { current_state_ = nullptr; return; } - current_state_ = std::visit([](auto &s) -> BaseState * { return &s; }, state_stack_.back()); + current_state_ = std::visit([](auto &s) -> BaseState * { + return &s; + }, + state_stack_.back()); } + void IStreamQuadIterator::ImplXML::pop_state() { assert(!state_stack_.empty()); state_stack_.pop_back(); update_current_state(); } - std::string_view IStreamQuadIterator::ImplXML::trim(std::string_view v) { + + std::string_view IStreamQuadIterator::ImplXML::trim_left(std::string_view v) { auto s = v.find_first_not_of(" \t\r\n"); if (s == std::string_view::npos) { return ""; @@ -319,56 +361,60 @@ namespace rdf4cpp::parser { // ReSharper disable once CppDFALocalValueEscapesFunction return v; } + bool IStreamQuadIterator::ImplXML::iri_equal_pieces(std::string_view 
const full_iri, std::string_view const uri, std::string_view const local_name) { if (full_iri.size() != local_name.size() + uri.size()) { return false; } return full_iri.starts_with(uri) && full_iri.ends_with(local_name); } + bool IStreamQuadIterator::ImplXML::iri_reserved(std::string_view uri, std::string_view local_name) { static constexpr std::array reserved = { - RDFState::start_element, - DescriptionState::id_attrib, - DescriptionState::about_attrib, - PredicateState::parse_type_attrib, - PredicateState::resource_attrib, - DescriptionState::node_id_attrib, - TypedLiteralPredicateState::datatype_attrib, - BaseState::base_attribute, - BaseState::lang_attribute, - std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"), - std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"), - std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"), + RDFState::start_element, + DescriptionState::id_attrib, + DescriptionState::about_attrib, + PredicateState::parse_type_attrib, + PredicateState::resource_attrib, + DescriptionState::node_id_attrib, + TypedLiteralPredicateState::datatype_attrib, + BaseState::base_attribute, + BaseState::lang_attribute, + std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"), + std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"), + std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"), }; return std::ranges::any_of(reserved, [&](std::string_view const e) { return iri_equal_pieces(e, uri, local_name); }); } + template NT IStreamQuadIterator::ImplXML::inspect_node(NT node) { try { state_->inspect_node_func(node); return node; - } - catch (std::exception &e) { + } catch (std::exception &e) { add_error(ParsingError::Type::BadSyntax, std::format("Triple explicitly skipped by inspect function: {}", e.what())); - } - catch (...) { + } catch (...) 
{ add_error(ParsingError::Type::BadSyntax, "Triple explicitly skipped by inspect function"); } return NT::make_null(); } + IRI IStreamQuadIterator::ImplXML::make_iri(std::string_view const iri, std::string_view const base) { if (base.empty()) { - for (const auto &s : state_stack_ | std::ranges::views::reverse) { - auto const v = std::visit([](const auto& s) -> std::string_view { return s.base; }, s); + for (auto const &s : state_stack_ | std::ranges::views::reverse) { + auto const v = std::visit([](auto const &s) -> std::string_view { + return s.base; + }, + s); if (!v.empty()) { state_->iri_factory.set_base_unchecked(v); break; } } - } - else { + } else { state_->iri_factory.set_base_unchecked(base); } auto exp = state_->iri_factory.from_maybe_relative(iri, state_->node_storage); @@ -379,11 +425,13 @@ namespace rdf4cpp::parser { return IRI::make_null(); } } + IRI IStreamQuadIterator::ImplXML::make_iri(std::string_view const uri, std::string_view const local_name, std::string_view const base) { std::string iri{uri}; iri.append(local_name); return make_iri(iri, base); } + IRI IStreamQuadIterator::ImplXML::make_id(std::string_view const local_name, std::string_view const base) { std::string local = "#"; local.append(local_name); @@ -395,9 +443,15 @@ namespace rdf4cpp::parser { reserved_ids_.insert(iri); return iri; } + IRI IStreamQuadIterator::ImplXML::make_hardcoded_iri(std::string_view const iri) const { return IRI::make_unchecked(iri, state_->node_storage); } + + IRI IStreamQuadIterator::ImplXML::make_type_iri() const { + return IRI::rdf_type(state_->node_storage); + } + Node IStreamQuadIterator::ImplXML::make_bn(std::optional name) { std::string n = ""; if (!name.has_value()) { @@ -405,23 +459,20 @@ namespace rdf4cpp::parser { name = n; } try { - if (state_->blank_node_scope_manager == nullptr) - { + if (state_->blank_node_scope_manager == nullptr) { return inspect_node(BlankNode::make(*name)); - } - else { + } else { return 
inspect_node(state_->blank_node_scope_manager.scope("").get_or_generate_node(*name, state_->node_storage)); } - } - catch (InvalidNode const &e) { + } catch (InvalidNode const &e) { add_error(ParsingError::Type::BadBlankNode, e.what()); return BlankNode::make_null(); - } - catch (...) { + } catch (...) { add_error(ParsingError::Type::BadBlankNode, "unknown error"); return BlankNode::make_null(); } } + Literal IStreamQuadIterator::ImplXML::make_literal(std::string_view value, std::optional datatype, std::optional lang_tag) { Literal l = Literal::make_null(); try { @@ -429,8 +480,11 @@ namespace rdf4cpp::parser { l = Literal::make_typed(value, *datatype, state_->node_storage); } else { if (!lang_tag.has_value() || lang_tag->empty()) { - for (const auto &s : state_stack_ | std::ranges::views::reverse) { - auto const v = std::visit([](const auto& s) -> std::string_view { return s.lang_tag; }, s); + for (auto const &s : state_stack_ | std::ranges::views::reverse) { + auto const v = std::visit([](auto const &s) -> std::string_view { + return s.lang_tag; + }, + s); if (!v.empty()) { lang_tag = v; break; @@ -439,20 +493,18 @@ namespace rdf4cpp::parser { } if (lang_tag.has_value() && !lang_tag->empty()) { l = Literal::make_lang_tagged(value, *lang_tag, state_->node_storage); - } - else { + } else { l = Literal::make_simple(value); } } - } - catch (InvalidNode const &e) { + } catch (InvalidNode const &e) { add_error(ParsingError::Type::BadLiteral, e.what()); - } - catch (...) { + } catch (...) { add_error(ParsingError::Type::BadLiteral, "unknown error"); } return inspect_node(l); } + void IStreamQuadIterator::ImplXML::on_error(void *th, char const *msg, ...) 
{ // NOLINT(*-dcl50-cpp) va_list args; auto t = static_cast(th); @@ -460,8 +512,7 @@ namespace rdf4cpp::parser { std::string out{}; out.resize(1024, '\0'); auto l = vsnprintf(out.data(), out.size(), msg, args); // NOLINT(*-pro-bounds-array-to-pointer-decay) - if (l > 0) - { + if (l > 0) { out.resize(l); } else { out = "unknown error, too long to fit"; @@ -470,65 +521,68 @@ namespace rdf4cpp::parser { va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay) } - IStreamQuadIterator::ImplXML::BaseState::InheritedAttributeInfo IStreamQuadIterator::ImplXML::BaseState::get_inherited_attributes(ImplXML *impl, std::span const attributes) { + IStreamQuadIterator::ImplXML::BaseState::InheritedAttributeInfo IStreamQuadIterator::ImplXML::BaseState::get_inherited_attributes(ImplXML &impl, std::span const attributes) { InheritedAttributeInfo r{}; - for (const auto& a : attributes) { - if (iri_equal_pieces(base_attribute, a.uri(), a.local_name())) - { + for (auto const &a : attributes) { + if (iri_equal_pieces(base_attribute, a.uri(), a.local_name())) { if (auto e = IRIView(a.value()).quick_validate(); e != IRIFactoryError::Ok) { - impl->add_error(ParsingError::Type::BadIri, std::format("invalid base IRI ({}): {}", e, a.value())); + impl.add_error(ParsingError::Type::BadIri, std::format("invalid base IRI ({}): {}", e, a.value())); } r.base = a.value(); - } - else if (iri_equal_pieces(lang_attribute, a.uri(), a.local_name())) - { + } else if (iri_equal_pieces(lang_attribute, a.uri(), a.local_name())) { r.lang_tag = a.value(); } } return r; } - void IStreamQuadIterator::ImplXML::InitialState::on_characters(ImplXML *impl, std::string_view const chars) { - if (!trim(chars).empty()) { - impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found characters"); + + void IStreamQuadIterator::ImplXML::InitialState::on_characters(ImplXML &i, std::string_view const chars) { + if (!trim_left(chars).empty()) { + i.add_error(ParsingError::Type::BadSyntax, "expected RDF, found 
characters"); } } - void IStreamQuadIterator::ImplXML::InitialState::on_start_element(ImplXML *impl, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes) { + + void IStreamQuadIterator::ImplXML::InitialState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes) { if (iri_equal_pieces(RDFState::start_element, uri, local_name)) { - impl->state_stack_.emplace_back(std::in_place_type_t{}, get_inherited_attributes(impl, attributes)); - impl->update_current_state(); + i.state_stack_.emplace_back(std::in_place_type_t{}, get_inherited_attributes(i, attributes)); + i.update_current_state(); return; } - impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found ???"); + i.add_error(ParsingError::Type::BadSyntax, "expected RDF, found ???"); } - void IStreamQuadIterator::ImplXML::InitialState::on_end_element(ImplXML *impl) { - impl->add_error(ParsingError::Type::BadSyntax, "expected RDF, found end of ???"); + + void IStreamQuadIterator::ImplXML::InitialState::on_end_element(ImplXML &i) { + i.add_error(ParsingError::Type::BadSyntax, "expected RDF, found end of ???"); } - void IStreamQuadIterator::ImplXML::RDFState::on_characters(ImplXML *impl, std::string_view const chars) { - if (!trim(chars).empty()) { - impl->add_error(ParsingError::Type::BadSyntax, "expected Description, found characters"); + void IStreamQuadIterator::ImplXML::RDFState::on_characters(ImplXML &i, std::string_view const chars) { + if (!trim_left(chars).empty()) { + i.add_error(ParsingError::Type::BadSyntax, "expected Description, found characters"); } } - void IStreamQuadIterator::ImplXML::RDFState::on_start_element(ImplXML *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { - DescriptionState::enter(impl, local_name, uri, attributes, [](auto) {}); + + void IStreamQuadIterator::ImplXML::RDFState::on_start_element(ImplXML &i, 
std::string_view const local_name, std::string_view const uri, std::span const attributes) { + DescriptionState::enter(i, local_name, uri, attributes, [](auto) { + }); } - void IStreamQuadIterator::ImplXML::RDFState::on_end_element(ImplXML *impl) { - impl->pop_state(); + + void IStreamQuadIterator::ImplXML::RDFState::on_end_element(ImplXML &i) { + i.pop_state(); } - void IStreamQuadIterator::ImplXML::DescriptionState::on_characters(ImplXML *impl, std::string_view const chars) { - if (!trim(chars).empty()) { - impl->add_error(ParsingError::Type::BadSyntax, "expected predicate, found characters"); + void IStreamQuadIterator::ImplXML::DescriptionState::on_characters(ImplXML &i, std::string_view const chars) { + if (!trim_left(chars).empty()) { + i.add_error(ParsingError::Type::BadSyntax, "expected predicate, found characters"); } } - void IStreamQuadIterator::ImplXML::DescriptionState::on_start_element(ImplXML *impl, std::string_view const local_name, std::string_view const uri, std::span attributes) { - auto const i = get_inherited_attributes(impl, attributes); + + void IStreamQuadIterator::ImplXML::DescriptionState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span attributes) { + auto const inherited_attribute_info = get_inherited_attributes(i, attributes); IRI predicate; if (iri_equal_pieces(PredicateState::list_start_element, uri, local_name)) { - predicate = impl->make_iri(std::format("http://www.w3.org/1999/02/22-rdf-syntax-ns#_{}", list_current++), i.base); - } - else { - predicate = impl->make_iri(uri, local_name, i.base); + predicate = i.make_iri(std::format("http://www.w3.org/1999/02/22-rdf-syntax-ns#_{}", list_current++), inherited_attribute_info.base); + } else { + predicate = i.make_iri(uri, local_name, inherited_attribute_info.base); } std::optional datatype = std::nullopt; std::optional sub = std::nullopt; @@ -538,13 +592,13 @@ namespace rdf4cpp::parser { bool parse_collection = false; for (auto const 
&att : attributes) { if (iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name())) { - datatype = impl->make_iri(att.value(), i.base); + datatype = i.make_iri(att.value(), inherited_attribute_info.base); } else if (iri_equal_pieces(PredicateState::resource_attrib, att.uri(), att.local_name())) { - sub = impl->make_iri(att.value(), i.base); + sub = i.make_iri(att.value(), inherited_attribute_info.base); } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { - sub = impl->make_bn(att.value()); + sub = i.make_bn(att.value()); } else if (iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { - reify = impl->make_id(att.value(), i.base); + reify = i.make_id(att.value(), inherited_attribute_info.base); } else if (iri_equal_pieces(PredicateState::parse_type_attrib, att.uri(), att.local_name())) { if (att.value() == PredicateState::parse_type_resource) { parse_resource = true; @@ -560,49 +614,56 @@ namespace rdf4cpp::parser { continue; } if (!sub.has_value()) { - sub = impl->make_bn(std::nullopt); + sub = i.make_bn(std::nullopt); } if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { - IRI const obj = impl->make_iri(att.value(), base); - impl->add_statement(*sub, IRI::rdf_type(impl->state_->node_storage), obj, IRI::make_null()); + IRI const obj = i.make_iri(att.value(), base); + i.add_statement(*sub, i.make_type_iri(), obj, IRI::make_null()); } else { - IRI const pred = impl->make_iri(att.uri(), att.local_name(), base); - Literal const obj = impl->make_literal(att.value(), std::nullopt, i.lang_tag); - impl->add_statement(*sub, pred, obj, IRI::make_null()); + IRI const pred = i.make_iri(att.uri(), att.local_name(), base); + Literal const obj = i.make_literal(att.value(), std::nullopt, inherited_attribute_info.lang_tag); + i.add_statement(*sub, pred, obj, IRI::make_null()); } } if (sub.has_value() && (parse_collection || parse_literal || parse_resource)) { - 
impl->add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:parseType, rdf:nodeID and rdf:resource"); + i.add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:parseType, rdf:nodeID and rdf:resource"); } if (datatype.has_value()) { - impl->state_stack_.emplace_back(std::in_place_type_t{}, i, subject, predicate, reify, *datatype); + i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify, *datatype); } else if (sub.has_value()) { - impl->add_statement(subject, predicate, *sub, reify); - impl->state_stack_.emplace_back(std::in_place_type_t{}); + i.add_statement(subject, predicate, *sub, reify); + i.state_stack_.emplace_back(std::in_place_type_t{}); } else if (parse_resource) { - Node obj = impl->make_bn(std::nullopt); - impl->add_statement(subject, predicate, obj, reify); - impl->state_stack_.emplace_back(std::in_place_type_t{}, i, obj); - } else if (parse_literal) { // TODO tests - auto& xml_state = impl->state_stack_.emplace_back(std::in_place_type_t{}, i, subject, predicate, reify); - std::visit([&](T& o) { if constexpr (std::same_as) { o.source_input(impl); }}, xml_state); + Node obj = i.make_bn(std::nullopt); + i.add_statement(subject, predicate, obj, reify); + i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, obj); + } else if (parse_literal) { + auto &xml_state = i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); + std::visit([&](T &o) { + if constexpr (std::same_as) { + o.source_input(i); + } + }, + xml_state); } else if (parse_collection) { - impl->state_stack_.emplace_back(std::in_place_type_t{}, i, subject, predicate, reify); + i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); } else { - impl->state_stack_.emplace_back(std::in_place_type_t{}, i, subject, predicate, reify); + i.state_stack_.emplace_back(std::in_place_type_t{}, 
inherited_attribute_info, subject, predicate, reify); } - impl->update_current_state(); + i.update_current_state(); } - void IStreamQuadIterator::ImplXML::DescriptionState::on_end_element(ImplXML *impl) { - impl->pop_state(); + + void IStreamQuadIterator::ImplXML::DescriptionState::on_end_element(ImplXML &i) { + i.pop_state(); } + template - void IStreamQuadIterator::ImplXML::DescriptionState::enter(ImplXML *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { - auto const i = get_inherited_attributes(impl, attributes); + void IStreamQuadIterator::ImplXML::DescriptionState::enter(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { + auto const inherited_attribute_info = get_inherited_attributes(i, attributes); Node sub = Node::make_null(); - auto check_only_one = [&sub, impl]() { + auto check_only_one = [&sub, &i]() { if (!sub.null()) { - impl->add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:ID, rdf:about, and rdf:nodeID"); + i.add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:ID, rdf:about, and rdf:nodeID"); return true; } return false; @@ -612,28 +673,26 @@ namespace rdf4cpp::parser { if (check_only_one()) { continue; } - sub = impl->make_iri(att.value(), i.base); + sub = i.make_iri(att.value(), inherited_attribute_info.base); } else if (iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { if (check_only_one()) { continue; } - sub = impl->make_id(att.value(), i.base); + sub = i.make_id(att.value(), inherited_attribute_info.base); } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { if (check_only_one()) { continue; } - sub = impl->make_bn(att.value()); + sub = i.make_bn(att.value()); } } - if (sub.null()) - { - sub = impl->make_bn(std::nullopt); + if (sub.null()) { + sub = i.make_bn(std::nullopt); } if (!iri_equal_pieces(start_element, uri, local_name)) { - IRI const obj = 
impl->make_iri(uri, local_name, i.base); - if (!obj.null()) - { - impl->add_statement(sub, IRI::rdf_type(impl->state_->node_storage), obj, IRI::make_null()); + IRI const obj = i.make_iri(uri, local_name, inherited_attribute_info.base); + if (!obj.null()) { + i.add_statement(sub, i.make_type_iri(), obj, IRI::make_null()); } } for (auto const &att : attributes) { @@ -641,98 +700,104 @@ namespace rdf4cpp::parser { continue; } if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { - IRI const obj = impl->make_iri(att.value(), i.base); - impl->add_statement(sub, IRI::rdf_type(impl->state_->node_storage), obj, IRI::make_null()); + IRI const obj = i.make_iri(att.value(), inherited_attribute_info.base); + i.add_statement(sub, i.make_type_iri(), obj, IRI::make_null()); } else { - IRI const pred = impl->make_iri(att.uri(), att.local_name(), i.base); - Literal const obj = impl->make_literal(att.value(), std::nullopt, i.lang_tag); - impl->add_statement(sub, pred, obj, IRI::make_null()); + IRI const pred = i.make_iri(att.uri(), att.local_name(), inherited_attribute_info.base); + Literal const obj = i.make_literal(att.value(), std::nullopt, inherited_attribute_info.lang_tag); + i.add_statement(sub, pred, obj, IRI::make_null()); } } f(sub); - impl->state_stack_.emplace_back(std::in_place_type_t{}, i, sub); - impl->update_current_state(); + i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, sub); + i.update_current_state(); } - void IStreamQuadIterator::ImplXML::PredicateState::on_characters([[maybe_unused]] ImplXML *impl, std::string_view const chars) { + void IStreamQuadIterator::ImplXML::PredicateState::on_characters([[maybe_unused]] ImplXML &i, std::string_view const chars) { if (done) { - if (!trim(chars).empty()) { - impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found literal"); + if (!trim_left(chars).empty()) { + i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found literal"); } return; } 
literal.append(chars); } - void IStreamQuadIterator::ImplXML::PredicateState::on_start_element(ImplXML *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { - if (!trim(literal).empty()) { - impl->add_error(ParsingError::Type::BadSyntax, "expected end of element or literal, found element"); + + void IStreamQuadIterator::ImplXML::PredicateState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes) { + if (!trim_left(literal).empty()) { + i.add_error(ParsingError::Type::BadSyntax, "expected end of element or literal, found element"); return; } if (done) { - impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found element"); + i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found element"); return; } - DescriptionState::enter(impl, local_name, uri, attributes, [&](Node obj) { + DescriptionState::enter(i, local_name, uri, attributes, [&](Node obj) { done = true; - impl->add_statement(subject, predicate, obj, reify); + i.add_statement(subject, predicate, obj, reify); }); } - void IStreamQuadIterator::ImplXML::PredicateState::on_end_element(ImplXML *impl) { + + void IStreamQuadIterator::ImplXML::PredicateState::on_end_element(ImplXML &i) { if (!done) { - Literal const lit = impl->make_literal(literal, std::nullopt, std::nullopt); - impl->add_statement(subject, predicate, lit, reify); + Literal const lit = i.make_literal(literal, std::nullopt, std::nullopt); + i.add_statement(subject, predicate, lit, reify); } - impl->pop_state(); + i.pop_state(); } bool IStreamQuadIterator::ImplXML::PredicateState::iri_reserved_predicate(std::string_view const uri, std::string_view const local_name) { return iri_reserved(uri, local_name) || iri_equal_pieces(DescriptionState::start_element, uri, local_name) || iri_equal_pieces(list_start_element, uri, local_name); } - void 
IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::on_start_element(ImplXML *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { - impl->add_error(ParsingError::Type::BadSyntax, "expected literal, found element"); + + void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::on_start_element(ImplXML &i, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + i.add_error(ParsingError::Type::BadSyntax, "expected literal, found element"); } - void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::on_end_element(ImplXML *impl) { + + void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::on_end_element(ImplXML &i) { if (!datatype.null()) { - Literal const lit = impl->make_literal(literal, datatype, std::nullopt); - impl->add_statement(subject, predicate, lit, reify); + Literal const lit = i.make_literal(literal, datatype, std::nullopt); + i.add_statement(subject, predicate, lit, reify); } - impl->pop_state(); + i.pop_state(); } - void IStreamQuadIterator::ImplXML::XMLLiteralState::on_characters(ImplXML *impl, [[maybe_unused]] std::string_view chars) { - source_input(impl); + + void IStreamQuadIterator::ImplXML::XMLLiteralState::on_characters(ImplXML &i, [[maybe_unused]] std::string_view chars) { + source_input(i); } - void IStreamQuadIterator::ImplXML::XMLLiteralState::on_start_element(ImplXML *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + + void IStreamQuadIterator::ImplXML::XMLLiteralState::on_start_element(ImplXML &i, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { ++depth; - source_input(impl); + source_input(i); } - void IStreamQuadIterator::ImplXML::XMLLiteralState::on_end_element(ImplXML *impl) { + + void 
IStreamQuadIterator::ImplXML::XMLLiteralState::on_end_element(ImplXML &i) { if (depth > 0) { --depth; - source_input(impl); + source_input(i); return; } - IRI datatype = impl->make_hardcoded_iri("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"); + IRI datatype = i.make_hardcoded_iri("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"); std::string_view l = literal; l = l.substr(0, last_offset); l.remove_prefix(data_start); - if (!l.empty() && l[0] == '/') - { + if (!l.empty() && l[0] == '/') { l.remove_prefix(1); } - if (!l.empty() && l[0] == '>') - { + if (!l.empty() && l[0] == '>') { l.remove_prefix(1); } - Literal const lit = impl->make_literal(l, datatype, std::nullopt); - impl->add_statement(subject, predicate, lit, reify); - impl->pop_state(); + Literal const lit = i.make_literal(l, datatype, std::nullopt); + i.add_statement(subject, predicate, lit, reify); + i.pop_state(); } - void IStreamQuadIterator::ImplXML::XMLLiteralState::source_input(ImplXML *impl) { - const xmlChar* data; + + void IStreamQuadIterator::ImplXML::XMLLiteralState::source_input(ImplXML &i) { + xmlChar const *data; int size = 1024; int off = 0; - xmlCtxtGetInputWindow(impl->context_.get(), 0, &data, &size, &off); - std::string_view const sv{reinterpret_cast(data), static_cast(size)}; + xmlCtxtGetInputWindow(i.context_.get(), 0, &data, &size, &off); + std::string_view const sv{reinterpret_cast(data), static_cast(size)}; if (literal.empty()) { data_start = off; } @@ -742,59 +807,66 @@ namespace rdf4cpp::parser { } last_offset = static_cast(off) + last_size; } - void IStreamQuadIterator::ImplXML::CollectionState::on_characters(ImplXML *impl, std::string_view const chars) { - if (!trim(chars).empty()) { - impl->add_error(ParsingError::Type::BadSyntax, "expected element, found characters"); + + void IStreamQuadIterator::ImplXML::CollectionState::on_characters(ImplXML &i, std::string_view const chars) { + if (!trim_left(chars).empty()) { + i.add_error(ParsingError::Type::BadSyntax, 
"expected element, found characters"); } } - void IStreamQuadIterator::ImplXML::CollectionState::on_start_element(ImplXML *impl, std::string_view const local_name, std::string_view const uri, std::span const attributes) { - DescriptionState::enter(impl, local_name, uri, attributes, [&](Node const obj) { + + void IStreamQuadIterator::ImplXML::CollectionState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes) { + DescriptionState::enter(i, local_name, uri, attributes, [&](Node const obj) { if (first) { first = false; - last_bn = impl->make_bn(std::nullopt); - impl->add_statement(subject, predicate, last_bn, reify); + last_bn = i.make_bn(std::nullopt); + i.add_statement(subject, predicate, last_bn, reify); } else { - auto const bn = impl->make_bn(std::nullopt); - impl->add_statement(last_bn, impl->make_hardcoded_iri(iri_rest), bn, IRI::make_null()); + auto const bn = i.make_bn(std::nullopt); + i.add_statement(last_bn, i.make_hardcoded_iri(iri_rest), bn, IRI::make_null()); last_bn = bn; } - impl->add_statement(last_bn, impl->make_hardcoded_iri(iri_first), obj, IRI::make_null()); + i.add_statement(last_bn, i.make_hardcoded_iri(iri_first), obj, IRI::make_null()); }); } - void IStreamQuadIterator::ImplXML::CollectionState::on_end_element(ImplXML *impl) { - auto const nil = impl->make_hardcoded_iri(iri_nil); + + void IStreamQuadIterator::ImplXML::CollectionState::on_end_element(ImplXML &i) { + auto const nil = i.make_hardcoded_iri(iri_nil); if (first) { - impl->add_statement(subject, predicate, nil, reify); + i.add_statement(subject, predicate, nil, reify); } else { - impl->add_statement(last_bn, impl->make_hardcoded_iri(iri_rest), nil, IRI::make_null()); + i.add_statement(last_bn, i.make_hardcoded_iri(iri_rest), nil, IRI::make_null()); } - impl->pop_state(); + i.pop_state(); } - void IStreamQuadIterator::ImplXML::EmptyElement::on_characters(ImplXML *impl, std::string_view const chars) { - if 
(!trim(chars).empty()) { - impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found characters"); + + void IStreamQuadIterator::ImplXML::EmptyElement::on_characters(ImplXML &i, std::string_view const chars) { + if (!trim_left(chars).empty()) { + i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found characters"); } } - void IStreamQuadIterator::ImplXML::EmptyElement::on_start_element(ImplXML *impl, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { - impl->add_error(ParsingError::Type::BadSyntax, "expected end of element, found ???"); + + void IStreamQuadIterator::ImplXML::EmptyElement::on_start_element(ImplXML &i, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found ???"); } - void IStreamQuadIterator::ImplXML::EmptyElement::on_end_element(ImplXML *impl) { - impl->pop_state(); + + void IStreamQuadIterator::ImplXML::EmptyElement::on_end_element(ImplXML &i) { + i.pop_state(); } - IStreamQuadIterator::ImplXML::ImplXML(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type* state) + IStreamQuadIterator::ImplXML::ImplXML(void *obj, ReadFunc const read, ErrorFunc const err, EOFFunc const eof, state_type *state) : handler_(make_sax_handler()), context_(xmlCreatePushParserCtxt(&handler_, this, nullptr, 0, "mem")), reader_obj_(obj), read_func_(read), error_func_(err), eof_func_(eof), - owned_state_(state == nullptr ? std::make_unique() : nullptr), state_(state == nullptr ? owned_state_.get() : state){ + owned_state_(state == nullptr ? std::make_unique() : nullptr), state_(state == nullptr ? 
owned_state_.get() : state) { xmlCtxtSetOptions(context_.get(), XML_PARSE_NOENT | XML_PARSE_PEDANTIC | XML_PARSE_NOCDATA | XML_PARSE_NO_XXE | XML_PARSE_BIG_LINES); + state_stack_.reserve(10); state_stack_.emplace_back(std::in_place_type_t{}); update_current_state(); current_state_->base = IRIFactory::default_base; } std::optional IStreamQuadIterator::ImplXML::next() { - std::array buffer; // NOLINT(*-pro-type-member-init) + std::array buffer; // NOLINT(*-pro-type-member-init) while (result_queue_.empty() && error_func_(reader_obj_) == 0 && eof_func_(reader_obj_) == 0) { auto const read = read_func_(buffer.data(), sizeof(char), buffer.size(), reader_obj_); xmlParseChunk(context_.get(), buffer.data(), static_cast(read), eof_func_(reader_obj_) != 0); @@ -806,9 +878,11 @@ namespace rdf4cpp::parser { result_queue_.pop_front(); return r; } + uint64_t IStreamQuadIterator::ImplXML::current_line() const noexcept { return xmlSAX2GetLineNumber(context_.get()); } + uint64_t IStreamQuadIterator::ImplXML::current_column() const noexcept { return xmlSAX2GetColumnNumber(context_.get()); } @@ -817,4 +891,4 @@ namespace rdf4cpp::parser { std::unique_ptr IStreamQuadIterator::make_xml_impl(void *obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type *state) { return std::make_unique(obj, read, err, eof, state); } -} // namespace rdf4cpp::parser +} // namespace rdf4cpp::parser \ No newline at end of file diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index 9f2adb00..8fe07cc9 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -749,19 +749,19 @@ _:d1 "2" .)"; nt = R"( "1" . "1-again" .)"; } -// SUBCASE("xml literal") { TODO -// xml = R"( -// -// -// -// -//
-//
-// -//
)"; -// nt = R"( "

"^^ .)"; -// } + SUBCASE("xml literal") { + xml = R"( + + + + +
+
+ +
)"; + nt = R"( "
"^^ .)"; + } if (xml.empty()) { return; From a2dbc6fc33b4911808f1e3af18118aa7d05d7684 Mon Sep 17 00:00:00 2001 From: mcb Date: Wed, 26 Nov 2025 17:45:07 +0100 Subject: [PATCH 24/42] initial base --- private/rdf4cpp/parser/XMLParser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/private/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp index 7815ed86..dcdd9372 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -862,7 +862,7 @@ namespace rdf4cpp::parser { state_stack_.reserve(10); state_stack_.emplace_back(std::in_place_type_t{}); update_current_state(); - current_state_->base = IRIFactory::default_base; + current_state_->base = state_->iri_factory.get_base(); } std::optional IStreamQuadIterator::ImplXML::next() { From dfd809c0b4263ffb4c5fa035b5db02167f7f13d6 Mon Sep 17 00:00:00 2001 From: mcb Date: Thu, 27 Nov 2025 14:52:51 +0100 Subject: [PATCH 25/42] xmlliteral tests, cleanup --- private/rdf4cpp/parser/XMLParser.cpp | 6 ++++-- tests/parser/tests_XMLParser.cpp | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/private/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp index dcdd9372..58e152ef 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -18,6 +18,8 @@ #include namespace rdf4cpp::parser { + // not in a header, because that would complicate linking it against libxml + // probably even requiring to make libxml public struct IStreamQuadIterator::ImplXML final : Impl { private: xmlSAXHandler handler_; @@ -460,7 +462,7 @@ namespace rdf4cpp::parser { } try { if (state_->blank_node_scope_manager == nullptr) { - return inspect_node(BlankNode::make(*name)); + return inspect_node(BlankNode::make(*name, state_->node_storage)); } else { return inspect_node(state_->blank_node_scope_manager.scope("").get_or_generate_node(*name, state_->node_storage)); } @@ -494,7 +496,7 @@ namespace 
rdf4cpp::parser { if (lang_tag.has_value() && !lang_tag->empty()) { l = Literal::make_lang_tagged(value, *lang_tag, state_->node_storage); } else { - l = Literal::make_simple(value); + l = Literal::make_simple(value, state_->node_storage); } } } catch (InvalidNode const &e) { diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index 8fe07cc9..3d1f9f99 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -34,6 +34,8 @@ TEST_CASE("sanity test") { + + abc def
)"}; @@ -123,6 +125,18 @@ TEST_CASE("sanity test") { CHECK(it->value().predicate() == IRI::make("https://www.example.com/coll")); CHECK(it->value().object() == IRI::make("http://www.w3.org/1999/02/22-rdf-syntax-ns#nil")); ++it; + CHECK(it != std::default_sentinel); + CHECK(it->has_value()); + CHECK(it->value().subject() == IRI::make("https://www.example.com")); + CHECK(it->value().predicate() == IRI::make("https://www.example.com/a")); + CHECK(it->value().object() == Literal::make_typed(" ", IRI::make("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"))); + ++it; + CHECK(it != std::default_sentinel); + CHECK(it->has_value()); + CHECK(it->value().subject() == IRI::make("https://www.example.com")); + CHECK(it->value().predicate() == IRI::make("https://www.example.com/a")); + CHECK(it->value().object() == Literal::make_typed("abc def", IRI::make("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"))); + ++it; CHECK(it == std::default_sentinel); } From d64130a4b6fb21c6fdf03c9c5ed4f3573a1e87cd Mon Sep 17 00:00:00 2001 From: mcb Date: Fri, 28 Nov 2025 13:19:16 +0100 Subject: [PATCH 26/42] fixes --- .../parser/IStreamQuadIteratorSerdImpl.cpp | 11 ++++++-- .../parser/IStreamQuadIteratorSerdImpl.hpp | 2 +- private/rdf4cpp/parser/XMLParser.cpp | 27 +++++++++++++------ src/rdf4cpp/parser/ParsingFlags.hpp | 5 ++-- tests/bench_SerDe.cpp | 2 +- 5 files changed, 32 insertions(+), 15 deletions(-) diff --git a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp index 78714b9a..f605d307 100644 --- a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp +++ b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp @@ -400,15 +400,22 @@ IStreamQuadIterator::ImplSerd::ImplSerd(void *stream, flags_type flags, state_type *initial_state) noexcept : reader{serd_reader_new(extract_syntax_from_flags(flags), this, nullptr, &ImplSerd::on_base, &ImplSerd::on_prefix, &ImplSerd::on_stmt, nullptr)}, - state_owned(initial_state == 
nullptr ? std::make_unique() : nullptr), - state{initial_state == nullptr ? state_owned.get() : initial_state}, + state{initial_state}, + state_is_owned{false}, flags{flags} { + if (this->state == nullptr) { + this->state = new state_type{}; + this->state_is_owned = true; + } serd_reader_set_strict(this->reader.get(), !flags.contains(ParsingFlag::Lax)); serd_reader_set_error_sink(this->reader.get(), &ImplSerd::on_error, this); serd_reader_start_source_stream(this->reader.get(), read, error, stream, nullptr, 4096); } IStreamQuadIterator::ImplSerd::~ImplSerd() { + if (this->state_is_owned) { + delete this->state; + } } std::optional> IStreamQuadIterator::ImplSerd::next() { diff --git a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp index 5ed01348..21b4bd7a 100644 --- a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp +++ b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp @@ -31,8 +31,8 @@ struct IStreamQuadIterator::ImplSerd final : Impl { }; std::unique_ptr reader; - std::unique_ptr state_owned = nullptr; state_type *state; + bool state_is_owned; std::deque quad_buffer; std::optional last_error; diff --git a/private/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp index 58e152ef..6743facd 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -33,9 +33,9 @@ namespace rdf4cpp::parser { EOFFunc eof_func_; std::deque result_queue_; size_t next_bn_index_ = 0; - std::unique_ptr owned_state_; state_type *state_; - dice::sparse_map::sparse_set reserved_ids_; + bool state_is_owned_ = false; + dice::sparse_map::sparse_set reserved_ids_; static constexpr std::string_view reify_subject = "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject"; static constexpr std::string_view reify_predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate"; @@ -280,6 +280,7 @@ namespace rdf4cpp::parser { public: ImplXML(void *obj, ReadFunc read, 
ErrorFunc err, EOFFunc eof, state_type *state); + ~ImplXML() override; [[nodiscard]] std::optional next() override; @@ -343,9 +344,8 @@ namespace rdf4cpp::parser { return; } current_state_ = std::visit([](auto &s) -> BaseState * { - return &s; - }, - state_stack_.back()); + return &s; + }, state_stack_.back()); } void IStreamQuadIterator::ImplXML::pop_state() { @@ -438,11 +438,11 @@ namespace rdf4cpp::parser { std::string local = "#"; local.append(local_name); auto iri = make_iri(local, base); - if (reserved_ids_.contains(iri)) { + if (reserved_ids_.contains(iri.backend_handle().id())) { add_error(ParsingError::Type::BadIri, std::format("{}: is already used as a rdf:ID", iri)); return IRI::make_null(); } - reserved_ids_.insert(iri); + reserved_ids_.insert(iri.backend_handle().id()); return iri; } @@ -859,13 +859,24 @@ namespace rdf4cpp::parser { : handler_(make_sax_handler()), context_(xmlCreatePushParserCtxt(&handler_, this, nullptr, 0, "mem")), reader_obj_(obj), read_func_(read), error_func_(err), eof_func_(eof), - owned_state_(state == nullptr ? std::make_unique() : nullptr), state_(state == nullptr ? 
owned_state_.get() : state) { + state_(state) { xmlCtxtSetOptions(context_.get(), XML_PARSE_NOENT | XML_PARSE_PEDANTIC | XML_PARSE_NOCDATA | XML_PARSE_NO_XXE | XML_PARSE_BIG_LINES); state_stack_.reserve(10); state_stack_.emplace_back(std::in_place_type_t{}); update_current_state(); + + if (state_ == nullptr) { + state_ = new state_type(); + state_is_owned_ = true; + } + current_state_->base = state_->iri_factory.get_base(); } + IStreamQuadIterator::ImplXML::~ImplXML() { + if (state_is_owned_) { + delete state_; + } + } std::optional IStreamQuadIterator::ImplXML::next() { std::array buffer; // NOLINT(*-pro-type-member-init) diff --git a/src/rdf4cpp/parser/ParsingFlags.hpp b/src/rdf4cpp/parser/ParsingFlags.hpp index d49fec55..6da05a64 100644 --- a/src/rdf4cpp/parser/ParsingFlags.hpp +++ b/src/rdf4cpp/parser/ParsingFlags.hpp @@ -22,9 +22,8 @@ enum struct ParsingFlag : uint8_t { NQuads = 0b10 << 4, TriG = 0b11 << 4, RdfXml = 0b100 << 4, - - SyntaxMask = 0b111 << 4, }; +constexpr uint8_t ParsingFlag_SyntaxMask = 0b111 << 4; struct ParsingFlags { private: @@ -70,7 +69,7 @@ struct ParsingFlags { * @return the syntax ParsingFlag contained in this ParsingFlags. 
(Turtle if not specified) */ [[nodiscard]] constexpr ParsingFlag get_syntax() const noexcept { - return static_cast(flags & static_cast(ParsingFlag::SyntaxMask)); + return static_cast(flags & static_cast(ParsingFlag_SyntaxMask)); } [[nodiscard]] constexpr bool syntax_allows_prefixes() const noexcept { diff --git a/tests/bench_SerDe.cpp b/tests/bench_SerDe.cpp index 64e6f732..a221b514 100644 --- a/tests/bench_SerDe.cpp +++ b/tests/bench_SerDe.cpp @@ -27,7 +27,7 @@ void deserialize(std::filesystem::path const &in_path, Dataset &ds, storage::Dyn parser::IStreamQuadIterator qit{in_file, reinterpret_cast(&fread), reinterpret_cast(&ferror), - reinterpret_cast(&feof), + reinterpret_cast(&feof), parser::ParsingFlags::none(), &state}; From 5e8bac865a2bed12ff6a62efe77f9b41c86261c3 Mon Sep 17 00:00:00 2001 From: mcb Date: Fri, 28 Nov 2025 13:47:32 +0100 Subject: [PATCH 27/42] inplace poly --- private/rdf4cpp/parser/XMLParser.cpp | 65 ++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/private/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp index 6743facd..84ed3435 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -17,6 +17,8 @@ #include +#include + namespace rdf4cpp::parser { // not in a header, because that would complicate linking it against libxml // probably even requiring to make libxml public @@ -91,6 +93,7 @@ namespace rdf4cpp::parser { virtual void on_characters(ImplXML &impl, std::string_view chars) = 0; virtual void on_start_element(ImplXML &impl, std::string_view local_name, std::string_view uri, std::span attributes) = 0; virtual void on_end_element(ImplXML &impl) = 0; + virtual void move_to(BaseState *b) noexcept = 0; struct InheritedAttributeInfo { std::string_view base = ""; @@ -113,6 +116,7 @@ namespace rdf4cpp::parser { void on_characters(ImplXML &i, std::string_view chars) override; void on_start_element(ImplXML &i, std::string_view local_name, 
std::string_view uri, std::span attributes) override; void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; InitialState() : BaseState({}) { @@ -123,6 +127,7 @@ namespace rdf4cpp::parser { void on_characters(ImplXML &i, std::string_view chars) override; void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF"; @@ -133,6 +138,7 @@ namespace rdf4cpp::parser { void on_characters(ImplXML &i, std::string_view chars) override; void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; Node subject; size_t list_current = 1; @@ -155,6 +161,7 @@ namespace rdf4cpp::parser { void on_characters(ImplXML &i, std::string_view chars) override; void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; Node subject; IRI predicate; @@ -179,6 +186,7 @@ namespace rdf4cpp::parser { struct TypedLiteralPredicateState final : PredicateState { void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; IRI datatype; @@ -193,6 +201,7 @@ namespace rdf4cpp::parser { void on_characters(ImplXML &i, std::string_view chars) override; void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; size_t depth = 0; size_t data_start = 0; @@ 
-208,6 +217,7 @@ namespace rdf4cpp::parser { void on_characters(ImplXML &i, std::string_view chars) override; void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; Node subject; IRI predicate; @@ -228,6 +238,7 @@ namespace rdf4cpp::parser { void on_characters(ImplXML &i, std::string_view chars) override; void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; EmptyElement() : BaseState({}) { @@ -235,7 +246,7 @@ namespace rdf4cpp::parser { }; BaseState *current_state_ = nullptr; - std::vector> state_stack_; + std::vector> state_stack_; static xmlSAXHandler make_sax_handler(); @@ -282,6 +293,11 @@ namespace rdf4cpp::parser { ImplXML(void *obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type *state); ~ImplXML() override; + ImplXML(ImplXML const &) = delete; + ImplXML &operator=(ImplXML const &) = delete; + ImplXML(ImplXML &&) = delete; + ImplXML &operator=(ImplXML &&) = delete; + [[nodiscard]] std::optional next() override; [[nodiscard]] uint64_t current_line() const noexcept override; @@ -343,9 +359,7 @@ namespace rdf4cpp::parser { current_state_ = nullptr; return; } - current_state_ = std::visit([](auto &s) -> BaseState * { - return &s; - }, state_stack_.back()); + current_state_ = &state_stack_.back().get(); } void IStreamQuadIterator::ImplXML::pop_state() { @@ -407,10 +421,7 @@ namespace rdf4cpp::parser { IRI IStreamQuadIterator::ImplXML::make_iri(std::string_view const iri, std::string_view const base) { if (base.empty()) { for (auto const &s : state_stack_ | std::ranges::views::reverse) { - auto const v = std::visit([](auto const &s) -> std::string_view { - return s.base; - }, - s); + std::string_view const v = s.get().base; if (!v.empty()) { 
state_->iri_factory.set_base_unchecked(v); break; @@ -483,10 +494,7 @@ namespace rdf4cpp::parser { } else { if (!lang_tag.has_value() || lang_tag->empty()) { for (auto const &s : state_stack_ | std::ranges::views::reverse) { - auto const v = std::visit([](auto const &s) -> std::string_view { - return s.lang_tag; - }, - s); + std::string_view const v = s.get().lang_tag; if (!v.empty()) { lang_tag = v; break; @@ -556,6 +564,9 @@ namespace rdf4cpp::parser { void IStreamQuadIterator::ImplXML::InitialState::on_end_element(ImplXML &i) { i.add_error(ParsingError::Type::BadSyntax, "expected RDF, found end of ???"); } + void IStreamQuadIterator::ImplXML::InitialState::move_to(BaseState *b) noexcept { + new (b) InitialState(std::move(*this)); + } void IStreamQuadIterator::ImplXML::RDFState::on_characters(ImplXML &i, std::string_view const chars) { if (!trim_left(chars).empty()) { @@ -571,6 +582,9 @@ namespace rdf4cpp::parser { void IStreamQuadIterator::ImplXML::RDFState::on_end_element(ImplXML &i) { i.pop_state(); } + void IStreamQuadIterator::ImplXML::RDFState::move_to(BaseState *b) noexcept { + new (b) RDFState(std::move(*this)); + } void IStreamQuadIterator::ImplXML::DescriptionState::on_characters(ImplXML &i, std::string_view const chars) { if (!trim_left(chars).empty()) { @@ -636,17 +650,12 @@ namespace rdf4cpp::parser { i.add_statement(subject, predicate, *sub, reify); i.state_stack_.emplace_back(std::in_place_type_t{}); } else if (parse_resource) { - Node obj = i.make_bn(std::nullopt); + Node const obj = i.make_bn(std::nullopt); i.add_statement(subject, predicate, obj, reify); i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, obj); } else if (parse_literal) { auto &xml_state = i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); - std::visit([&](T &o) { - if constexpr (std::same_as) { - o.source_input(i); - } - }, - xml_state); + static_cast(xml_state.get()).source_input(i); // 
NOLINT(*-pro-type-static-cast-downcast) } else if (parse_collection) { i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); } else { @@ -658,6 +667,9 @@ namespace rdf4cpp::parser { void IStreamQuadIterator::ImplXML::DescriptionState::on_end_element(ImplXML &i) { i.pop_state(); } + void IStreamQuadIterator::ImplXML::DescriptionState::move_to(BaseState *b) noexcept { + new (b) DescriptionState(std::move(*this)); + } template void IStreamQuadIterator::ImplXML::DescriptionState::enter(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { @@ -747,6 +759,9 @@ namespace rdf4cpp::parser { } i.pop_state(); } + void IStreamQuadIterator::ImplXML::PredicateState::move_to(BaseState *b) noexcept { + new (b) PredicateState(std::move(*this)); + } bool IStreamQuadIterator::ImplXML::PredicateState::iri_reserved_predicate(std::string_view const uri, std::string_view const local_name) { return iri_reserved(uri, local_name) || iri_equal_pieces(DescriptionState::start_element, uri, local_name) || iri_equal_pieces(list_start_element, uri, local_name); @@ -763,6 +778,9 @@ namespace rdf4cpp::parser { } i.pop_state(); } + void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::move_to(BaseState *b) noexcept { + new (b) TypedLiteralPredicateState(std::move(*this)); + } void IStreamQuadIterator::ImplXML::XMLLiteralState::on_characters(ImplXML &i, [[maybe_unused]] std::string_view chars) { source_input(i); @@ -793,6 +811,9 @@ namespace rdf4cpp::parser { i.add_statement(subject, predicate, lit, reify); i.pop_state(); } + void IStreamQuadIterator::ImplXML::XMLLiteralState::move_to(BaseState *b) noexcept { + new (b) XMLLiteralState(std::move(*this)); + } void IStreamQuadIterator::ImplXML::XMLLiteralState::source_input(ImplXML &i) { xmlChar const *data; @@ -840,6 +861,9 @@ namespace rdf4cpp::parser { } i.pop_state(); } + void 
IStreamQuadIterator::ImplXML::CollectionState::move_to(BaseState *b) noexcept { + new (b) CollectionState(std::move(*this)); + } void IStreamQuadIterator::ImplXML::EmptyElement::on_characters(ImplXML &i, std::string_view const chars) { if (!trim_left(chars).empty()) { @@ -854,6 +878,9 @@ namespace rdf4cpp::parser { void IStreamQuadIterator::ImplXML::EmptyElement::on_end_element(ImplXML &i) { i.pop_state(); } + void IStreamQuadIterator::ImplXML::EmptyElement::move_to(BaseState *b) noexcept { + new (b) EmptyElement(std::move(*this)); + } IStreamQuadIterator::ImplXML::ImplXML(void *obj, ReadFunc const read, ErrorFunc const err, EOFFunc const eof, state_type *state) : handler_(make_sax_handler()), From 40c0f5a28c3b5bb8d79cecf75154ee220dc96bed Mon Sep 17 00:00:00 2001 From: mcb Date: Fri, 28 Nov 2025 15:16:19 +0100 Subject: [PATCH 28/42] doc --- private/rdf4cpp/parser/XMLParser.cpp | 34 ++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/private/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp index 84ed3435..fcb215d4 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -88,6 +88,11 @@ namespace rdf4cpp::parser { } }; + /** + * most states handle one or more of the elements in https://www.w3.org/TR/rdf11-xml/#section-Infoset-Grammar . + * note that the creation of a state is done by on_start_element of the previous state. + * each state holds information on base iri and language tag defined on the corresponding xml element. 
+ */ struct BaseState { // NOLINT(*-special-member-functions) virtual ~BaseState() = default; virtual void on_characters(ImplXML &impl, std::string_view chars) = 0; @@ -112,6 +117,9 @@ namespace rdf4cpp::parser { static InheritedAttributeInfo get_inherited_attributes(ImplXML &impl, std::span attributes); }; + /** + * initial state, checks for start of https://www.w3.org/TR/rdf11-xml/#RDF + */ struct InitialState final : BaseState { void on_characters(ImplXML &i, std::string_view chars) override; void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; @@ -123,6 +131,9 @@ namespace rdf4cpp::parser { } }; + /** + * state for https://www.w3.org/TR/rdf11-xml/#RDF + */ struct RDFState final : BaseState { void on_characters(ImplXML &i, std::string_view chars) override; void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; @@ -134,6 +145,11 @@ namespace rdf4cpp::parser { using BaseState::BaseState; }; + /** + * state for https://www.w3.org/TR/rdf11-xml/#nodeElementList and https://www.w3.org/TR/rdf11-xml/#nodeElement + * on_start_element checks and dispatches for the different options in https://www.w3.org/TR/rdf11-xml/#propertyEltList + * (https://www.w3.org/TR/rdf11-xml/#parseTypeResourcePropertyElt has no own state, instead gets handled directly by on_start_element) + */ struct DescriptionState final : BaseState { void on_characters(ImplXML &i, std::string_view chars) override; void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; @@ -157,6 +173,11 @@ namespace rdf4cpp::parser { static constexpr std::string_view type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; }; + /** + * state for https://www.w3.org/TR/rdf11-xml/#resourcePropertyElt (nested nodeElement / DescriptionState + * and https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (literal with no datatype attribute) + 
* and https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (with no attributes (empty literal)) + */ struct PredicateState : BaseState { void on_characters(ImplXML &i, std::string_view chars) override; void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; @@ -183,6 +204,9 @@ namespace rdf4cpp::parser { static bool iri_reserved_predicate(std::string_view uri, std::string_view local_name); }; + /** + * state for https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (with datatype attribute) + */ struct TypedLiteralPredicateState final : PredicateState { void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; void on_end_element(ImplXML &i) override; @@ -197,6 +221,10 @@ namespace rdf4cpp::parser { static constexpr std::string_view datatype_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype"; }; + /** + * state for https://www.w3.org/TR/rdf11-xml/#parseTypeLiteralPropertyElt + * (and https://www.w3.org/TR/rdf11-xml/#parseTypeOtherPropertyElt) + */ struct XMLLiteralState final : PredicateState { void on_characters(ImplXML &i, std::string_view chars) override; void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; @@ -213,6 +241,9 @@ namespace rdf4cpp::parser { void source_input(ImplXML &i); }; + /** + * state for https://www.w3.org/TR/rdf11-xml/#parseTypeCollectionPropertyElt + */ struct CollectionState final : BaseState { void on_characters(ImplXML &i, std::string_view chars) override; void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; @@ -234,6 +265,9 @@ namespace rdf4cpp::parser { static constexpr std::string_view iri_first = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first"; }; + /** + * state for https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (if attributes are present) + */ struct EmptyElement final : 
BaseState { void on_characters(ImplXML &i, std::string_view chars) override; void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; From 7f4261f3711f2a15404ef6e15a53ac8f432e8eaf Mon Sep 17 00:00:00 2001 From: mcb Date: Wed, 3 Dec 2025 13:06:35 +0100 Subject: [PATCH 29/42] reorganize --- private/rdf4cpp/parser/XMLParser.cpp | 343 +-------------------- private/rdf4cpp/parser/XMLParser.hpp | 341 ++++++++++++++++++++ src/rdf4cpp/parser/IStreamQuadIterator.cpp | 3 +- src/rdf4cpp/parser/IStreamQuadIterator.hpp | 2 - 4 files changed, 344 insertions(+), 345 deletions(-) create mode 100644 private/rdf4cpp/parser/XMLParser.hpp diff --git a/private/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp index fcb215d4..911e6794 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -1,344 +1,8 @@ -#include -#include -#include - -#include - -#include - -#include -#include -#include -#include +#include #include -#include - -#include - -#include - namespace rdf4cpp::parser { - // not in a header, because that would complicate linking it against libxml - // probably even requiring to make libxml public - struct IStreamQuadIterator::ImplXML final : Impl { - private: - xmlSAXHandler handler_; - std::unique_ptr - context_; - void *reader_obj_; - ReadFunc read_func_; - ErrorFunc error_func_; - EOFFunc eof_func_; - std::deque result_queue_; - size_t next_bn_index_ = 0; - state_type *state_; - bool state_is_owned_ = false; - dice::sparse_map::sparse_set reserved_ids_; - - static constexpr std::string_view reify_subject = "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject"; - static constexpr std::string_view reify_predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate"; - static constexpr std::string_view reify_object = "http://www.w3.org/1999/02/22-rdf-syntax-ns#object"; - static constexpr std::string_view reify_type = 
"http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement"; - - static std::string_view from_xml_char(xmlChar const *s) { - if (s == nullptr) { - return ""; - } - // ReSharper disable once CppDFALocalValueEscapesFunction - return {reinterpret_cast(s)}; - } - - static std::string_view from_xml_char(xmlChar const *s, xmlChar const *e) { - if (s == nullptr) { - return ""; - } - // ReSharper disable once CppDFALocalValueEscapesFunction - return {reinterpret_cast(s), reinterpret_cast(e)}; - } - - static std::string_view from_xml_char(xmlChar const *s, int const n) { - if (s == nullptr) { - return ""; - } - // ReSharper disable once CppDFALocalValueEscapesFunction - return {reinterpret_cast(s), static_cast(n)}; - } - - struct Attribute { - xmlChar const *local_name_raw; - xmlChar const *prefix_raw; - xmlChar const *uri_raw; - xmlChar const *value_start_raw; - xmlChar const *value_end_raw; - - [[nodiscard]] std::string_view value() const { - return from_xml_char(value_start_raw, value_end_raw); - } - - [[nodiscard]] std::string_view local_name() const { - return from_xml_char(local_name_raw); - } - - [[nodiscard]] std::string_view uri() const { - return from_xml_char(uri_raw); - } - }; - - /** - * most states handle one or more of the elements in https://www.w3.org/TR/rdf11-xml/#section-Infoset-Grammar . - * note that the creation of a state is done by on_start_element of the previous state. - * each state holds information on base iri and language tag defined on the corresponding xml element. 
- */ - struct BaseState { // NOLINT(*-special-member-functions) - virtual ~BaseState() = default; - virtual void on_characters(ImplXML &impl, std::string_view chars) = 0; - virtual void on_start_element(ImplXML &impl, std::string_view local_name, std::string_view uri, std::span attributes) = 0; - virtual void on_end_element(ImplXML &impl) = 0; - virtual void move_to(BaseState *b) noexcept = 0; - - struct InheritedAttributeInfo { - std::string_view base = ""; - std::string_view lang_tag = ""; - }; - - std::string base; - std::string lang_tag; - - explicit BaseState(InheritedAttributeInfo const &i) - : base(i.base), lang_tag(i.lang_tag) { - } - - static constexpr std::string_view base_attribute = "http://www.w3.org/XML/1998/namespacebase"; - static constexpr std::string_view lang_attribute = "http://www.w3.org/XML/1998/namespacelang"; - static InheritedAttributeInfo get_inherited_attributes(ImplXML &impl, std::span attributes); - }; - - /** - * initial state, checks for start of https://www.w3.org/TR/rdf11-xml/#RDF - */ - struct InitialState final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; - void move_to(BaseState *b) noexcept override; - - InitialState() - : BaseState({}) { - } - }; - - /** - * state for https://www.w3.org/TR/rdf11-xml/#RDF - */ - struct RDFState final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; - void move_to(BaseState *b) noexcept override; - - static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF"; - - using BaseState::BaseState; - }; - - /** - * state for https://www.w3.org/TR/rdf11-xml/#nodeElementList and 
https://www.w3.org/TR/rdf11-xml/#nodeElement - * on_start_element checks and dispatches for the different options in https://www.w3.org/TR/rdf11-xml/#propertyEltList - * (https://www.w3.org/TR/rdf11-xml/#parseTypeResourcePropertyElt has no own state, instead gets handled directly by on_start_element) - */ - struct DescriptionState final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; - void move_to(BaseState *b) noexcept override; - - Node subject; - size_t list_current = 1; - - explicit DescriptionState(InheritedAttributeInfo const &i, Node sub) - : BaseState(i), subject(sub) { - } - - template - static void enter(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes, F f); - - static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description"; - static constexpr std::string_view about_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about"; - static constexpr std::string_view id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#ID"; - static constexpr std::string_view node_id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID"; - static constexpr std::string_view type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; - }; - - /** - * state for https://www.w3.org/TR/rdf11-xml/#resourcePropertyElt (nested nodeElement / DescriptionState - * and https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (literal with no datatype attribute) - * and https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (with no attributes (empty literal)) - */ - struct PredicateState : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML 
&i) override; - void move_to(BaseState *b) noexcept override; - - Node subject; - IRI predicate; - IRI reify; - std::string literal; - bool done = false; - - PredicateState(InheritedAttributeInfo const &i, Node sub, IRI predicate, IRI reify) - : BaseState(i), subject(sub), predicate(predicate), reify(reify) { - } - - static constexpr std::string_view resource_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#resource"; - static constexpr std::string_view parse_type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#parseType"; - static constexpr std::string_view parse_type_resource = "Resource"; - static constexpr std::string_view parse_type_literal = "Literal"; - static constexpr std::string_view parse_type_collection = "Collection"; - static constexpr std::string_view list_start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#li"; - - static bool iri_reserved_predicate(std::string_view uri, std::string_view local_name); - }; - - /** - * state for https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (with datatype attribute) - */ - struct TypedLiteralPredicateState final : PredicateState { - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; - void move_to(BaseState *b) noexcept override; - - IRI datatype; - - TypedLiteralPredicateState(InheritedAttributeInfo const &i, Node iri, IRI predicate, IRI reify, IRI datatype) - : PredicateState(i, iri, predicate, reify), datatype(datatype) { - } - - static constexpr std::string_view datatype_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype"; - }; - - /** - * state for https://www.w3.org/TR/rdf11-xml/#parseTypeLiteralPropertyElt - * (and https://www.w3.org/TR/rdf11-xml/#parseTypeOtherPropertyElt) - */ - struct XMLLiteralState final : PredicateState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, 
std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; - void move_to(BaseState *b) noexcept override; - - size_t depth = 0; - size_t data_start = 0; - size_t last_offset = 0; - size_t last_size = 0; - - using PredicateState::PredicateState; - - void source_input(ImplXML &i); - }; - - /** - * state for https://www.w3.org/TR/rdf11-xml/#parseTypeCollectionPropertyElt - */ - struct CollectionState final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; - void move_to(BaseState *b) noexcept override; - - Node subject; - IRI predicate; - Node last_bn = Node::make_null(); - IRI reify; - bool first = true; - - CollectionState(InheritedAttributeInfo const &i, Node sub, IRI pred, IRI reify) - : BaseState(i), subject(sub), predicate(pred), reify(reify) { - } - - static constexpr std::string_view iri_nil = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil"; - static constexpr std::string_view iri_rest = "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest"; - static constexpr std::string_view iri_first = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first"; - }; - - /** - * state for https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (if attributes are present) - */ - struct EmptyElement final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; - void move_to(BaseState *b) noexcept override; - - EmptyElement() - : BaseState({}) { - } - }; - - BaseState *current_state_ = nullptr; - std::vector> state_stack_; - - static xmlSAXHandler make_sax_handler(); - - void add_error(ParsingError::Type ty, std::string msg); - /** - * add statement to the output list, if none of 
the components is null - * (null is used to track an already inserted parse error for that component) - */ - void add_statement(Node subject, IRI predicate, Node object, IRI reify); - void update_current_state(); - void pop_state(); - /** - * removes whitespace according to xml spec - */ - [[nodiscard]] static std::string_view trim_left(std::string_view v); - [[nodiscard]] static bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); - [[nodiscard]] static bool iri_reserved(std::string_view uri, std::string_view local_name); - template - [[nodiscard]] NT inspect_node(NT node); - [[nodiscard]] IRI make_iri(std::string_view iri, std::string_view base); - [[nodiscard]] IRI make_iri(std::string_view uri, std::string_view local_name, std::string_view base); - /** - * create the IRI for an id_attrib, including uniqueness check - */ - [[nodiscard]] IRI make_id(std::string_view local_name, std::string_view base); - /** - * create an IRI with no checks, intended for hardcoded IRIs like reify_subject - */ - [[nodiscard]] IRI make_hardcoded_iri(std::string_view iri) const; - [[nodiscard]] IRI make_type_iri() const; - [[nodiscard]] Node make_bn(std::optional name); - /** - * creates a literal - * @param value - * @param datatype - * @param lang_tag (ignored, if datatype is set) - * @return - */ - [[nodiscard]] Literal make_literal(std::string_view value, std::optional datatype, std::optional lang_tag); - - static void on_error(void *th, char const *msg, ...); - - public: - ImplXML(void *obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type *state); - ~ImplXML() override; - - ImplXML(ImplXML const &) = delete; - ImplXML &operator=(ImplXML const &) = delete; - ImplXML(ImplXML &&) = delete; - ImplXML &operator=(ImplXML &&) = delete; - - [[nodiscard]] std::optional next() override; - - [[nodiscard]] uint64_t current_line() const noexcept override; - [[nodiscard]] uint64_t current_column() const noexcept override; - }; - - 
xmlSAXHandler IStreamQuadIterator::ImplXML::make_sax_handler() { xmlSAXHandler r{}; std::memset(&r, 0, sizeof(xmlSAXHandler)); @@ -960,9 +624,4 @@ namespace rdf4cpp::parser { uint64_t IStreamQuadIterator::ImplXML::current_column() const noexcept { return xmlSAX2GetColumnNumber(context_.get()); } - - - std::unique_ptr IStreamQuadIterator::make_xml_impl(void *obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type *state) { - return std::make_unique(obj, read, err, eof, state); - } } // namespace rdf4cpp::parser \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLParser.hpp b/private/rdf4cpp/parser/XMLParser.hpp new file mode 100644 index 00000000..23a6d853 --- /dev/null +++ b/private/rdf4cpp/parser/XMLParser.hpp @@ -0,0 +1,341 @@ +#ifndef RDF4CPP_XMLPARSER_H +#define RDF4CPP_XMLPARSER_H + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +namespace rdf4cpp::parser { + struct IStreamQuadIterator::ImplXML final : Impl { + private: + xmlSAXHandler handler_; + std::unique_ptr + context_; + void *reader_obj_; + ReadFunc read_func_; + ErrorFunc error_func_; + EOFFunc eof_func_; + std::deque result_queue_; + size_t next_bn_index_ = 0; + state_type *state_; + bool state_is_owned_ = false; + dice::sparse_map::sparse_set reserved_ids_; + + static constexpr std::string_view reify_subject = "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject"; + static constexpr std::string_view reify_predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate"; + static constexpr std::string_view reify_object = "http://www.w3.org/1999/02/22-rdf-syntax-ns#object"; + static constexpr std::string_view reify_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement"; + + static std::string_view from_xml_char(xmlChar const *s) { + if (s == nullptr) { + return ""; + } + // ReSharper disable once CppDFALocalValueEscapesFunction + return {reinterpret_cast(s)}; + } + + static std::string_view 
from_xml_char(xmlChar const *s, xmlChar const *e) { + if (s == nullptr) { + return ""; + } + // ReSharper disable once CppDFALocalValueEscapesFunction + return {reinterpret_cast(s), reinterpret_cast(e)}; + } + + static std::string_view from_xml_char(xmlChar const *s, int const n) { + if (s == nullptr) { + return ""; + } + // ReSharper disable once CppDFALocalValueEscapesFunction + return {reinterpret_cast(s), static_cast(n)}; + } + + struct Attribute { + xmlChar const *local_name_raw; + xmlChar const *prefix_raw; + xmlChar const *uri_raw; + xmlChar const *value_start_raw; + xmlChar const *value_end_raw; + + [[nodiscard]] std::string_view value() const { + return from_xml_char(value_start_raw, value_end_raw); + } + + [[nodiscard]] std::string_view local_name() const { + return from_xml_char(local_name_raw); + } + + [[nodiscard]] std::string_view uri() const { + return from_xml_char(uri_raw); + } + }; + + /** + * most states handle one or more of the elements in https://www.w3.org/TR/rdf11-xml/#section-Infoset-Grammar . + * note that the creation of a state is done by on_start_element of the previous state. + * each state holds information on base iri and language tag defined on the corresponding xml element. 
+ */ + struct BaseState { // NOLINT(*-special-member-functions) + virtual ~BaseState() = default; + virtual void on_characters(ImplXML &impl, std::string_view chars) = 0; + virtual void on_start_element(ImplXML &impl, std::string_view local_name, std::string_view uri, std::span attributes) = 0; + virtual void on_end_element(ImplXML &impl) = 0; + virtual void move_to(BaseState *b) noexcept = 0; + + struct InheritedAttributeInfo { + std::string_view base = ""; + std::string_view lang_tag = ""; + }; + + std::string base; + std::string lang_tag; + + explicit BaseState(InheritedAttributeInfo const &i) + : base(i.base), lang_tag(i.lang_tag) { + } + + static constexpr std::string_view base_attribute = "http://www.w3.org/XML/1998/namespacebase"; + static constexpr std::string_view lang_attribute = "http://www.w3.org/XML/1998/namespacelang"; + static InheritedAttributeInfo get_inherited_attributes(ImplXML &impl, std::span attributes); + }; + + /** + * initial state, checks for start of https://www.w3.org/TR/rdf11-xml/#RDF + */ + struct InitialState final : BaseState { + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + InitialState() + : BaseState({}) { + } + }; + + /** + * state for https://www.w3.org/TR/rdf11-xml/#RDF + */ + struct RDFState final : BaseState { + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF"; + + using BaseState::BaseState; + }; + + /** + * state for https://www.w3.org/TR/rdf11-xml/#nodeElementList and 
https://www.w3.org/TR/rdf11-xml/#nodeElement + * on_start_element checks and dispatches for the different options in https://www.w3.org/TR/rdf11-xml/#propertyEltList + * (https://www.w3.org/TR/rdf11-xml/#parseTypeResourcePropertyElt has no own state, instead gets handled directly by on_start_element) + */ + struct DescriptionState final : BaseState { + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + Node subject; + size_t list_current = 1; + + explicit DescriptionState(InheritedAttributeInfo const &i, Node sub) + : BaseState(i), subject(sub) { + } + + template + static void enter(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes, F f); + + static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description"; + static constexpr std::string_view about_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about"; + static constexpr std::string_view id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#ID"; + static constexpr std::string_view node_id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID"; + static constexpr std::string_view type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; + }; + + /** + * state for https://www.w3.org/TR/rdf11-xml/#resourcePropertyElt (nested nodeElement / DescriptionState + * and https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (literal with no datatype attribute) + * and https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (with no attributes (empty literal)) + */ + struct PredicateState : BaseState { + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML 
&i) override; + void move_to(BaseState *b) noexcept override; + + Node subject; + IRI predicate; + IRI reify; + std::string literal; + bool done = false; + + PredicateState(InheritedAttributeInfo const &i, Node sub, IRI predicate, IRI reify) + : BaseState(i), subject(sub), predicate(predicate), reify(reify) { + } + + static constexpr std::string_view resource_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#resource"; + static constexpr std::string_view parse_type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#parseType"; + static constexpr std::string_view parse_type_resource = "Resource"; + static constexpr std::string_view parse_type_literal = "Literal"; + static constexpr std::string_view parse_type_collection = "Collection"; + static constexpr std::string_view list_start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#li"; + + static bool iri_reserved_predicate(std::string_view uri, std::string_view local_name); + }; + + /** + * state for https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (with datatype attribute) + */ + struct TypedLiteralPredicateState final : PredicateState { + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + IRI datatype; + + TypedLiteralPredicateState(InheritedAttributeInfo const &i, Node iri, IRI predicate, IRI reify, IRI datatype) + : PredicateState(i, iri, predicate, reify), datatype(datatype) { + } + + static constexpr std::string_view datatype_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype"; + }; + + /** + * state for https://www.w3.org/TR/rdf11-xml/#parseTypeLiteralPropertyElt + * (and https://www.w3.org/TR/rdf11-xml/#parseTypeOtherPropertyElt) + */ + struct XMLLiteralState final : PredicateState { + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, 
std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + size_t depth = 0; + size_t data_start = 0; + size_t last_offset = 0; + size_t last_size = 0; + + using PredicateState::PredicateState; + + void source_input(ImplXML &i); + }; + + /** + * state for https://www.w3.org/TR/rdf11-xml/#parseTypeCollectionPropertyElt + */ + struct CollectionState final : BaseState { + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + Node subject; + IRI predicate; + Node last_bn = Node::make_null(); + IRI reify; + bool first = true; + + CollectionState(InheritedAttributeInfo const &i, Node sub, IRI pred, IRI reify) + : BaseState(i), subject(sub), predicate(pred), reify(reify) { + } + + static constexpr std::string_view iri_nil = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil"; + static constexpr std::string_view iri_rest = "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest"; + static constexpr std::string_view iri_first = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first"; + }; + + /** + * state for https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (if attributes are present) + */ + struct EmptyElement final : BaseState { + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + EmptyElement() + : BaseState({}) { + } + }; + + BaseState *current_state_ = nullptr; + std::vector> state_stack_; + + static xmlSAXHandler make_sax_handler(); + + void add_error(ParsingError::Type ty, std::string msg); + /** + * add statement to the output list, if none of 
the components is null + * (null is used to track an already inserted parse error for that component) + */ + void add_statement(Node subject, IRI predicate, Node object, IRI reify); + void update_current_state(); + void pop_state(); + /** + * removes whitespace according to xml spec + */ + [[nodiscard]] static std::string_view trim_left(std::string_view v); + [[nodiscard]] static bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); + [[nodiscard]] static bool iri_reserved(std::string_view uri, std::string_view local_name); + template + [[nodiscard]] NT inspect_node(NT node); + [[nodiscard]] IRI make_iri(std::string_view iri, std::string_view base); + [[nodiscard]] IRI make_iri(std::string_view uri, std::string_view local_name, std::string_view base); + /** + * create the IRI for an id_attrib, including uniqueness check + */ + [[nodiscard]] IRI make_id(std::string_view local_name, std::string_view base); + /** + * create an IRI with no checks, intended for hardcoded IRIs like reify_subject + */ + [[nodiscard]] IRI make_hardcoded_iri(std::string_view iri) const; + [[nodiscard]] IRI make_type_iri() const; + [[nodiscard]] Node make_bn(std::optional name); + /** + * creates a literal + * @param value + * @param datatype + * @param lang_tag (ignored, if datatype is set) + * @return + */ + [[nodiscard]] Literal make_literal(std::string_view value, std::optional datatype, std::optional lang_tag); + + static void on_error(void *th, char const *msg, ...); + + public: + ImplXML(void *obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type *state); + ~ImplXML() override; + + ImplXML(ImplXML const &) = delete; + ImplXML &operator=(ImplXML const &) = delete; + ImplXML(ImplXML &&) = delete; + ImplXML &operator=(ImplXML &&) = delete; + + [[nodiscard]] std::optional next() override; + + [[nodiscard]] uint64_t current_line() const noexcept override; + [[nodiscard]] uint64_t current_column() const noexcept override; + }; +} + +#endif 
//RDF4CPP_XMLPARSER_H diff --git a/src/rdf4cpp/parser/IStreamQuadIterator.cpp b/src/rdf4cpp/parser/IStreamQuadIterator.cpp index 5c3e9aae..1abc99c9 100644 --- a/src/rdf4cpp/parser/IStreamQuadIterator.cpp +++ b/src/rdf4cpp/parser/IStreamQuadIterator.cpp @@ -1,6 +1,7 @@ #include "IStreamQuadIterator.hpp" #include +#include #include @@ -57,7 +58,7 @@ IStreamQuadIterator::IStreamQuadIterator(void *stream, flags_type flags, state_type *state) : impl{flags.get_syntax() == ParsingFlag::RdfXml ? - make_xml_impl(stream, read, error, eof, state) : + static_cast>(std::make_unique(stream, read, error, eof, state)) : std::make_unique(stream, read, error, flags, state)}, cur{impl->next()} { } diff --git a/src/rdf4cpp/parser/IStreamQuadIterator.hpp b/src/rdf4cpp/parser/IStreamQuadIterator.hpp index d8b00502..9ab9092d 100644 --- a/src/rdf4cpp/parser/IStreamQuadIterator.hpp +++ b/src/rdf4cpp/parser/IStreamQuadIterator.hpp @@ -106,8 +106,6 @@ struct IStreamQuadIterator { std::unique_ptr impl; std::optional> cur; - - static std::unique_ptr make_xml_impl(void* obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type* state); public: /** * Constructs a IStreamQuadIterator from a C-like io api. 
That is something similar to From c40d4dd1cbbb05052f110be77e24f517d65ee6b1 Mon Sep 17 00:00:00 2001 From: mcb Date: Wed, 3 Dec 2025 14:34:25 +0100 Subject: [PATCH 30/42] separate states --- CMakeLists.txt | 9 + private/rdf4cpp/parser/XMLParser.cpp | 353 +----------------- private/rdf4cpp/parser/XMLParser.hpp | 257 ++----------- .../parser/XMLParserStateCollector.hpp | 90 +++++ .../parser/XMLStates/XMLParserBaseState.cpp | 18 + .../parser/XMLStates/XMLParserBaseState.hpp | 37 ++ .../XMLStates/XMLParserCollectionState.cpp | 39 ++ .../XMLStates/XMLParserCollectionState.hpp | 32 ++ .../XMLStates/XMLParserDescriptionState.cpp | 88 +++++ .../XMLStates/XMLParserDescriptionState.hpp | 40 ++ .../XMLParserDescriptionStateEnter.hpp | 59 +++ .../XMLStates/XMLParserEmptyElement.cpp | 20 + .../XMLStates/XMLParserEmptyElement.hpp | 22 ++ .../XMLStates/XMLParserInitialState.cpp | 25 ++ .../XMLStates/XMLParserInitialState.hpp | 23 ++ .../XMLStates/XMLParserPredicateState.cpp | 47 +++ .../XMLStates/XMLParserPredicateState.hpp | 40 ++ .../parser/XMLStates/XMLParserRDFState.cpp | 23 ++ .../parser/XMLStates/XMLParserRDFState.hpp | 23 ++ .../XMLParserTypedLiteralPredicateState.cpp | 18 + .../XMLParserTypedLiteralPredicateState.hpp | 26 ++ .../XMLStates/XMLParserXMLLiteralState.cpp | 52 +++ .../XMLStates/XMLParserXMLLiteralState.hpp | 29 ++ src/rdf4cpp/parser/IStreamQuadIterator.hpp | 1 + 24 files changed, 784 insertions(+), 587 deletions(-) create mode 100644 private/rdf4cpp/parser/XMLParserStateCollector.hpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp create 
mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserDescriptionStateEnter.hpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp create mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ca0ae7c..6e49d6d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,6 +151,15 @@ add_library(rdf4cpp src/rdf4cpp/util/Anonymizer.cpp private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp private/rdf4cpp/parser/XMLParser.cpp + private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp + private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp + private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp + private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp + private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp + private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp + private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp + private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp + private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp private/rdf4cpp/regex/RegexImpl.cpp private/rdf4cpp/regex/RegexReplacerImpl.cpp ${serd_source_files} 
diff --git a/private/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp index 911e6794..c42e4edf 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -229,360 +229,9 @@ namespace rdf4cpp::parser { va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay) } - IStreamQuadIterator::ImplXML::BaseState::InheritedAttributeInfo IStreamQuadIterator::ImplXML::BaseState::get_inherited_attributes(ImplXML &impl, std::span const attributes) { - InheritedAttributeInfo r{}; - for (auto const &a : attributes) { - if (iri_equal_pieces(base_attribute, a.uri(), a.local_name())) { - if (auto e = IRIView(a.value()).quick_validate(); e != IRIFactoryError::Ok) { - impl.add_error(ParsingError::Type::BadIri, std::format("invalid base IRI ({}): {}", e, a.value())); - } - r.base = a.value(); - } else if (iri_equal_pieces(lang_attribute, a.uri(), a.local_name())) { - r.lang_tag = a.value(); - } - } - return r; - } - - void IStreamQuadIterator::ImplXML::InitialState::on_characters(ImplXML &i, std::string_view const chars) { - if (!trim_left(chars).empty()) { - i.add_error(ParsingError::Type::BadSyntax, "expected RDF, found characters"); - } - } - - void IStreamQuadIterator::ImplXML::InitialState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes) { - if (iri_equal_pieces(RDFState::start_element, uri, local_name)) { - i.state_stack_.emplace_back(std::in_place_type_t{}, get_inherited_attributes(i, attributes)); - i.update_current_state(); - return; - } - i.add_error(ParsingError::Type::BadSyntax, "expected RDF, found ???"); - } - - void IStreamQuadIterator::ImplXML::InitialState::on_end_element(ImplXML &i) { - i.add_error(ParsingError::Type::BadSyntax, "expected RDF, found end of ???"); - } - void IStreamQuadIterator::ImplXML::InitialState::move_to(BaseState *b) noexcept { - new (b) InitialState(std::move(*this)); - } - - void 
IStreamQuadIterator::ImplXML::RDFState::on_characters(ImplXML &i, std::string_view const chars) { - if (!trim_left(chars).empty()) { - i.add_error(ParsingError::Type::BadSyntax, "expected Description, found characters"); - } - } - - void IStreamQuadIterator::ImplXML::RDFState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes) { - DescriptionState::enter(i, local_name, uri, attributes, [](auto) { - }); - } - - void IStreamQuadIterator::ImplXML::RDFState::on_end_element(ImplXML &i) { - i.pop_state(); - } - void IStreamQuadIterator::ImplXML::RDFState::move_to(BaseState *b) noexcept { - new (b) RDFState(std::move(*this)); - } - - void IStreamQuadIterator::ImplXML::DescriptionState::on_characters(ImplXML &i, std::string_view const chars) { - if (!trim_left(chars).empty()) { - i.add_error(ParsingError::Type::BadSyntax, "expected predicate, found characters"); - } - } - - void IStreamQuadIterator::ImplXML::DescriptionState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span attributes) { - auto const inherited_attribute_info = get_inherited_attributes(i, attributes); - IRI predicate; - if (iri_equal_pieces(PredicateState::list_start_element, uri, local_name)) { - predicate = i.make_iri(std::format("http://www.w3.org/1999/02/22-rdf-syntax-ns#_{}", list_current++), inherited_attribute_info.base); - } else { - predicate = i.make_iri(uri, local_name, inherited_attribute_info.base); - } - std::optional datatype = std::nullopt; - std::optional sub = std::nullopt; - IRI reify = IRI::make_null(); - bool parse_resource = false; - bool parse_literal = false; - bool parse_collection = false; - for (auto const &att : attributes) { - if (iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name())) { - datatype = i.make_iri(att.value(), inherited_attribute_info.base); - } else if (iri_equal_pieces(PredicateState::resource_attrib, 
att.uri(), att.local_name())) { - sub = i.make_iri(att.value(), inherited_attribute_info.base); - } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { - sub = i.make_bn(att.value()); - } else if (iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { - reify = i.make_id(att.value(), inherited_attribute_info.base); - } else if (iri_equal_pieces(PredicateState::parse_type_attrib, att.uri(), att.local_name())) { - if (att.value() == PredicateState::parse_type_resource) { - parse_resource = true; - } else if (att.value() == PredicateState::parse_type_collection) { - parse_collection = true; - } else { - parse_literal = true; - } - } - } - for (auto const &att : attributes) { - if (PredicateState::iri_reserved_predicate(att.uri(), att.local_name())) { - continue; - } - if (!sub.has_value()) { - sub = i.make_bn(std::nullopt); - } - if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { - IRI const obj = i.make_iri(att.value(), base); - i.add_statement(*sub, i.make_type_iri(), obj, IRI::make_null()); - } else { - IRI const pred = i.make_iri(att.uri(), att.local_name(), base); - Literal const obj = i.make_literal(att.value(), std::nullopt, inherited_attribute_info.lang_tag); - i.add_statement(*sub, pred, obj, IRI::make_null()); - } - } - if (sub.has_value() && (parse_collection || parse_literal || parse_resource)) { - i.add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:parseType, rdf:nodeID and rdf:resource"); - } - if (datatype.has_value()) { - i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify, *datatype); - } else if (sub.has_value()) { - i.add_statement(subject, predicate, *sub, reify); - i.state_stack_.emplace_back(std::in_place_type_t{}); - } else if (parse_resource) { - Node const obj = i.make_bn(std::nullopt); - i.add_statement(subject, predicate, obj, reify); - i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, obj); - } else if 
(parse_literal) { - auto &xml_state = i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); - static_cast(xml_state.get()).source_input(i); // NOLINT(*-pro-type-static-cast-downcast) - } else if (parse_collection) { - i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); - } else { - i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); - } - i.update_current_state(); - } - - void IStreamQuadIterator::ImplXML::DescriptionState::on_end_element(ImplXML &i) { - i.pop_state(); - } - void IStreamQuadIterator::ImplXML::DescriptionState::move_to(BaseState *b) noexcept { - new (b) DescriptionState(std::move(*this)); - } - - template - void IStreamQuadIterator::ImplXML::DescriptionState::enter(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { - auto const inherited_attribute_info = get_inherited_attributes(i, attributes); - Node sub = Node::make_null(); - auto check_only_one = [&sub, &i]() { - if (!sub.null()) { - i.add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:ID, rdf:about, and rdf:nodeID"); - return true; - } - return false; - }; - for (auto const &att : attributes) { - if (iri_equal_pieces(about_attrib, att.uri(), att.local_name())) { - if (check_only_one()) { - continue; - } - sub = i.make_iri(att.value(), inherited_attribute_info.base); - } else if (iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { - if (check_only_one()) { - continue; - } - sub = i.make_id(att.value(), inherited_attribute_info.base); - } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { - if (check_only_one()) { - continue; - } - sub = i.make_bn(att.value()); - } - } - if (sub.null()) { - sub = i.make_bn(std::nullopt); - } - if (!iri_equal_pieces(start_element, uri, local_name)) { - IRI const obj = i.make_iri(uri, local_name, 
inherited_attribute_info.base); - if (!obj.null()) { - i.add_statement(sub, i.make_type_iri(), obj, IRI::make_null()); - } - } - for (auto const &att : attributes) { - if (PredicateState::iri_reserved_predicate(att.uri(), att.local_name())) { - continue; - } - if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { - IRI const obj = i.make_iri(att.value(), inherited_attribute_info.base); - i.add_statement(sub, i.make_type_iri(), obj, IRI::make_null()); - } else { - IRI const pred = i.make_iri(att.uri(), att.local_name(), inherited_attribute_info.base); - Literal const obj = i.make_literal(att.value(), std::nullopt, inherited_attribute_info.lang_tag); - i.add_statement(sub, pred, obj, IRI::make_null()); - } - } - f(sub); - i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, sub); - i.update_current_state(); - } - - void IStreamQuadIterator::ImplXML::PredicateState::on_characters([[maybe_unused]] ImplXML &i, std::string_view const chars) { - if (done) { - if (!trim_left(chars).empty()) { - i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found literal"); - } - return; - } - literal.append(chars); - } - - void IStreamQuadIterator::ImplXML::PredicateState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes) { - if (!trim_left(literal).empty()) { - i.add_error(ParsingError::Type::BadSyntax, "expected end of element or literal, found element"); - return; - } - if (done) { - i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found element"); - return; - } - DescriptionState::enter(i, local_name, uri, attributes, [&](Node obj) { - done = true; - i.add_statement(subject, predicate, obj, reify); - }); - } - - void IStreamQuadIterator::ImplXML::PredicateState::on_end_element(ImplXML &i) { - if (!done) { - Literal const lit = i.make_literal(literal, std::nullopt, std::nullopt); - i.add_statement(subject, predicate, lit, reify); - } - 
i.pop_state(); - } - void IStreamQuadIterator::ImplXML::PredicateState::move_to(BaseState *b) noexcept { - new (b) PredicateState(std::move(*this)); - } - - bool IStreamQuadIterator::ImplXML::PredicateState::iri_reserved_predicate(std::string_view const uri, std::string_view const local_name) { - return iri_reserved(uri, local_name) || iri_equal_pieces(DescriptionState::start_element, uri, local_name) || iri_equal_pieces(list_start_element, uri, local_name); - } - - void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::on_start_element(ImplXML &i, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { - i.add_error(ParsingError::Type::BadSyntax, "expected literal, found element"); - } - - void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::on_end_element(ImplXML &i) { - if (!datatype.null()) { - Literal const lit = i.make_literal(literal, datatype, std::nullopt); - i.add_statement(subject, predicate, lit, reify); - } - i.pop_state(); - } - void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::move_to(BaseState *b) noexcept { - new (b) TypedLiteralPredicateState(std::move(*this)); - } - - void IStreamQuadIterator::ImplXML::XMLLiteralState::on_characters(ImplXML &i, [[maybe_unused]] std::string_view chars) { - source_input(i); - } - - void IStreamQuadIterator::ImplXML::XMLLiteralState::on_start_element(ImplXML &i, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { - ++depth; - source_input(i); - } - - void IStreamQuadIterator::ImplXML::XMLLiteralState::on_end_element(ImplXML &i) { - if (depth > 0) { - --depth; - source_input(i); - return; - } - IRI datatype = i.make_hardcoded_iri("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"); - std::string_view l = literal; - l = l.substr(0, last_offset); - l.remove_prefix(data_start); - if (!l.empty() && l[0] == '/') { - l.remove_prefix(1); - } - if 
(!l.empty() && l[0] == '>') { - l.remove_prefix(1); - } - Literal const lit = i.make_literal(l, datatype, std::nullopt); - i.add_statement(subject, predicate, lit, reify); - i.pop_state(); - } - void IStreamQuadIterator::ImplXML::XMLLiteralState::move_to(BaseState *b) noexcept { - new (b) XMLLiteralState(std::move(*this)); - } - - void IStreamQuadIterator::ImplXML::XMLLiteralState::source_input(ImplXML &i) { - xmlChar const *data; - int size = 1024; - int off = 0; - xmlCtxtGetInputWindow(i.context_.get(), 0, &data, &size, &off); - std::string_view const sv{reinterpret_cast(data), static_cast(size)}; - if (literal.empty()) { - data_start = off; - } - if (!static_cast(literal).ends_with(sv)) { - last_size = literal.size(); - literal += sv; - } - last_offset = static_cast(off) + last_size; - } - - void IStreamQuadIterator::ImplXML::CollectionState::on_characters(ImplXML &i, std::string_view const chars) { - if (!trim_left(chars).empty()) { - i.add_error(ParsingError::Type::BadSyntax, "expected element, found characters"); - } - } - - void IStreamQuadIterator::ImplXML::CollectionState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes) { - DescriptionState::enter(i, local_name, uri, attributes, [&](Node const obj) { - if (first) { - first = false; - last_bn = i.make_bn(std::nullopt); - i.add_statement(subject, predicate, last_bn, reify); - } else { - auto const bn = i.make_bn(std::nullopt); - i.add_statement(last_bn, i.make_hardcoded_iri(iri_rest), bn, IRI::make_null()); - last_bn = bn; - } - i.add_statement(last_bn, i.make_hardcoded_iri(iri_first), obj, IRI::make_null()); - }); - } - - void IStreamQuadIterator::ImplXML::CollectionState::on_end_element(ImplXML &i) { - auto const nil = i.make_hardcoded_iri(iri_nil); - if (first) { - i.add_statement(subject, predicate, nil, reify); - } else { - i.add_statement(last_bn, i.make_hardcoded_iri(iri_rest), nil, IRI::make_null()); - } - i.pop_state(); - } - 
void IStreamQuadIterator::ImplXML::CollectionState::move_to(BaseState *b) noexcept { - new (b) CollectionState(std::move(*this)); - } - - void IStreamQuadIterator::ImplXML::EmptyElement::on_characters(ImplXML &i, std::string_view const chars) { - if (!trim_left(chars).empty()) { - i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found characters"); - } - } - - void IStreamQuadIterator::ImplXML::EmptyElement::on_start_element(ImplXML &i, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { - i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found ???"); - } - - void IStreamQuadIterator::ImplXML::EmptyElement::on_end_element(ImplXML &i) { - i.pop_state(); - } - void IStreamQuadIterator::ImplXML::EmptyElement::move_to(BaseState *b) noexcept { - new (b) EmptyElement(std::move(*this)); - } - IStreamQuadIterator::ImplXML::ImplXML(void *obj, ReadFunc const read, ErrorFunc const err, EOFFunc const eof, state_type *state) : handler_(make_sax_handler()), - context_(xmlCreatePushParserCtxt(&handler_, this, nullptr, 0, "mem")), + context_(xmlCreatePushParserCtxt(&handler_, this, nullptr, 0, "rdf/xml")), reader_obj_(obj), read_func_(read), error_func_(err), eof_func_(eof), state_(state) { xmlCtxtSetOptions(context_.get(), XML_PARSE_NOENT | XML_PARSE_PEDANTIC | XML_PARSE_NOCDATA | XML_PARSE_NO_XXE | XML_PARSE_BIG_LINES); diff --git a/private/rdf4cpp/parser/XMLParser.hpp b/private/rdf4cpp/parser/XMLParser.hpp index 23a6d853..f4670ac5 100644 --- a/private/rdf4cpp/parser/XMLParser.hpp +++ b/private/rdf4cpp/parser/XMLParser.hpp @@ -20,8 +20,20 @@ #include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + namespace rdf4cpp::parser { - struct IStreamQuadIterator::ImplXML final : Impl { + struct IStreamQuadIterator::ImplXML final : ImplXMLStateCollector { private: xmlSAXHandler handler_; std::unique_ptr(s)}; - } - - static 
std::string_view from_xml_char(xmlChar const *s, xmlChar const *e) { - if (s == nullptr) { - return ""; - } - // ReSharper disable once CppDFALocalValueEscapesFunction - return {reinterpret_cast(s), reinterpret_cast(e)}; - } - - static std::string_view from_xml_char(xmlChar const *s, int const n) { - if (s == nullptr) { - return ""; - } - // ReSharper disable once CppDFALocalValueEscapesFunction - return {reinterpret_cast(s), static_cast(n)}; - } - - struct Attribute { - xmlChar const *local_name_raw; - xmlChar const *prefix_raw; - xmlChar const *uri_raw; - xmlChar const *value_start_raw; - xmlChar const *value_end_raw; - - [[nodiscard]] std::string_view value() const { - return from_xml_char(value_start_raw, value_end_raw); - } - - [[nodiscard]] std::string_view local_name() const { - return from_xml_char(local_name_raw); - } - - [[nodiscard]] std::string_view uri() const { - return from_xml_char(uri_raw); - } - }; - - /** - * most states handle one or more of the elements in https://www.w3.org/TR/rdf11-xml/#section-Infoset-Grammar . - * note that the creation of a state is done by on_start_element of the previous state. - * each state holds information on base iri and language tag defined on the corresponding xml element. 
- */ - struct BaseState { // NOLINT(*-special-member-functions) - virtual ~BaseState() = default; - virtual void on_characters(ImplXML &impl, std::string_view chars) = 0; - virtual void on_start_element(ImplXML &impl, std::string_view local_name, std::string_view uri, std::span attributes) = 0; - virtual void on_end_element(ImplXML &impl) = 0; - virtual void move_to(BaseState *b) noexcept = 0; - - struct InheritedAttributeInfo { - std::string_view base = ""; - std::string_view lang_tag = ""; - }; - - std::string base; - std::string lang_tag; - - explicit BaseState(InheritedAttributeInfo const &i) - : base(i.base), lang_tag(i.lang_tag) { - } - - static constexpr std::string_view base_attribute = "http://www.w3.org/XML/1998/namespacebase"; - static constexpr std::string_view lang_attribute = "http://www.w3.org/XML/1998/namespacelang"; - static InheritedAttributeInfo get_inherited_attributes(ImplXML &impl, std::span attributes); - }; - - /** - * initial state, checks for start of https://www.w3.org/TR/rdf11-xml/#RDF - */ - struct InitialState final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; - void move_to(BaseState *b) noexcept override; - - InitialState() - : BaseState({}) { - } - }; - - /** - * state for https://www.w3.org/TR/rdf11-xml/#RDF - */ - struct RDFState final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; - void move_to(BaseState *b) noexcept override; - - static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF"; - - using BaseState::BaseState; - }; - - /** - * state for https://www.w3.org/TR/rdf11-xml/#nodeElementList and 
https://www.w3.org/TR/rdf11-xml/#nodeElement - * on_start_element checks and dispatches for the different options in https://www.w3.org/TR/rdf11-xml/#propertyEltList - * (https://www.w3.org/TR/rdf11-xml/#parseTypeResourcePropertyElt has no own state, instead gets handled directly by on_start_element) - */ - struct DescriptionState final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; - void move_to(BaseState *b) noexcept override; - - Node subject; - size_t list_current = 1; - - explicit DescriptionState(InheritedAttributeInfo const &i, Node sub) - : BaseState(i), subject(sub) { - } - - template - static void enter(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes, F f); - - static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description"; - static constexpr std::string_view about_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about"; - static constexpr std::string_view id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#ID"; - static constexpr std::string_view node_id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID"; - static constexpr std::string_view type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; - }; - - /** - * state for https://www.w3.org/TR/rdf11-xml/#resourcePropertyElt (nested nodeElement / DescriptionState - * and https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (literal with no datatype attribute) - * and https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (with no attributes (empty literal)) - */ - struct PredicateState : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML 
&i) override; - void move_to(BaseState *b) noexcept override; - - Node subject; - IRI predicate; - IRI reify; - std::string literal; - bool done = false; - - PredicateState(InheritedAttributeInfo const &i, Node sub, IRI predicate, IRI reify) - : BaseState(i), subject(sub), predicate(predicate), reify(reify) { - } - - static constexpr std::string_view resource_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#resource"; - static constexpr std::string_view parse_type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#parseType"; - static constexpr std::string_view parse_type_resource = "Resource"; - static constexpr std::string_view parse_type_literal = "Literal"; - static constexpr std::string_view parse_type_collection = "Collection"; - static constexpr std::string_view list_start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#li"; - - static bool iri_reserved_predicate(std::string_view uri, std::string_view local_name); - }; - - /** - * state for https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (with datatype attribute) - */ - struct TypedLiteralPredicateState final : PredicateState { - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; - void move_to(BaseState *b) noexcept override; - - IRI datatype; - - TypedLiteralPredicateState(InheritedAttributeInfo const &i, Node iri, IRI predicate, IRI reify, IRI datatype) - : PredicateState(i, iri, predicate, reify), datatype(datatype) { - } - - static constexpr std::string_view datatype_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype"; - }; - - /** - * state for https://www.w3.org/TR/rdf11-xml/#parseTypeLiteralPropertyElt - * (and https://www.w3.org/TR/rdf11-xml/#parseTypeOtherPropertyElt) - */ - struct XMLLiteralState final : PredicateState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, 
std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; - void move_to(BaseState *b) noexcept override; - - size_t depth = 0; - size_t data_start = 0; - size_t last_offset = 0; - size_t last_size = 0; - - using PredicateState::PredicateState; - - void source_input(ImplXML &i); - }; - - /** - * state for https://www.w3.org/TR/rdf11-xml/#parseTypeCollectionPropertyElt - */ - struct CollectionState final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; - void move_to(BaseState *b) noexcept override; - - Node subject; - IRI predicate; - Node last_bn = Node::make_null(); - IRI reify; - bool first = true; - - CollectionState(InheritedAttributeInfo const &i, Node sub, IRI pred, IRI reify) - : BaseState(i), subject(sub), predicate(pred), reify(reify) { - } - - static constexpr std::string_view iri_nil = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil"; - static constexpr std::string_view iri_rest = "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest"; - static constexpr std::string_view iri_first = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first"; - }; - - /** - * state for https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (if attributes are present) - */ - struct EmptyElement final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; - void move_to(BaseState *b) noexcept override; - - EmptyElement() - : BaseState({}) { - } - }; + friend struct BaseState; + friend struct InitialState; + friend struct RDFState; + friend struct DescriptionState; + friend struct PredicateState; + friend struct TypedLiteralPredicateState; + friend struct EmptyElement; + 
friend struct XMLLiteralState; + friend struct CollectionState; BaseState *current_state_ = nullptr; std::vector> state_stack_; diff --git a/private/rdf4cpp/parser/XMLParserStateCollector.hpp b/private/rdf4cpp/parser/XMLParserStateCollector.hpp new file mode 100644 index 00000000..00dc3c95 --- /dev/null +++ b/private/rdf4cpp/parser/XMLParserStateCollector.hpp @@ -0,0 +1,90 @@ +#ifndef RDF4CPP_XMLPARSERSTATECOLLECTOR_H +#define RDF4CPP_XMLPARSERSTATECOLLECTOR_H + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +namespace rdf4cpp::parser { + struct IStreamQuadIterator::ImplXMLStateCollector : Impl { + protected: + static std::string_view from_xml_char(xmlChar const *s) { + if (s == nullptr) { + return ""; + } + // ReSharper disable once CppDFALocalValueEscapesFunction + return {reinterpret_cast(s)}; + } + + static std::string_view from_xml_char(xmlChar const *s, xmlChar const *e) { + if (s == nullptr) { + return ""; + } + // ReSharper disable once CppDFALocalValueEscapesFunction + return {reinterpret_cast(s), reinterpret_cast(e)}; + } + + static std::string_view from_xml_char(xmlChar const *s, int const n) { + if (s == nullptr) { + return ""; + } + // ReSharper disable once CppDFALocalValueEscapesFunction + return {reinterpret_cast(s), static_cast(n)}; + } + + struct Attribute { + xmlChar const *local_name_raw; + xmlChar const *prefix_raw; + xmlChar const *uri_raw; + xmlChar const *value_start_raw; + xmlChar const *value_end_raw; + + [[nodiscard]] std::string_view value() const { + return from_xml_char(value_start_raw, value_end_raw); + } + + [[nodiscard]] std::string_view local_name() const { + return from_xml_char(local_name_raw); + } + + [[nodiscard]] std::string_view uri() const { + return from_xml_char(uri_raw); + } + }; + + struct BaseState; + + struct InitialState; + + struct RDFState; + + struct DescriptionState; + + struct PredicateState; + + struct 
TypedLiteralPredicateState; + + struct XMLLiteralState; + + struct CollectionState; + + struct EmptyElement; + }; +} // namespace rdf4cpp::parser + +#endif //RDF4CPP_XMLPARSERSTATECOLLECTOR_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp new file mode 100644 index 00000000..f9fda204 --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp @@ -0,0 +1,18 @@ +#include + +namespace rdf4cpp::parser { + IStreamQuadIterator::ImplXML::BaseState::InheritedAttributeInfo IStreamQuadIterator::ImplXML::BaseState::get_inherited_attributes(ImplXML &impl, std::span const attributes) { + InheritedAttributeInfo r{}; + for (auto const &a : attributes) { + if (ImplXML::iri_equal_pieces(base_attribute, a.uri(), a.local_name())) { + if (auto e = IRIView(a.value()).quick_validate(); e != IRIFactoryError::Ok) { + impl.add_error(ParsingError::Type::BadIri, std::format("invalid base IRI ({}): {}", e, a.value())); + } + r.base = a.value(); + } else if (ImplXML::iri_equal_pieces(lang_attribute, a.uri(), a.local_name())) { + r.lang_tag = a.value(); + } + } + return r; + } +} \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp new file mode 100644 index 00000000..84f9dfbf --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp @@ -0,0 +1,37 @@ +#ifndef RDF4CPP_XMLPARSERBASESTATE_H +#define RDF4CPP_XMLPARSERBASESTATE_H + +#include + +namespace rdf4cpp::parser { + /** + * most states handle one or more of the elements in https://www.w3.org/TR/rdf11-xml/#section-Infoset-Grammar . + * note that the creation of a state is done by on_start_element of the previous state. + * each state holds information on base iri and language tag defined on the corresponding xml element. 
+ */ + struct IStreamQuadIterator::ImplXMLStateCollector::BaseState { // NOLINT(*-special-member-functions) + virtual ~BaseState() = default; + virtual void on_characters(ImplXML &impl, std::string_view chars) = 0; + virtual void on_start_element(ImplXML &impl, std::string_view local_name, std::string_view uri, std::span attributes) = 0; + virtual void on_end_element(ImplXML &impl) = 0; + virtual void move_to(BaseState *b) noexcept = 0; + + struct InheritedAttributeInfo { + std::string_view base = ""; + std::string_view lang_tag = ""; + }; + + std::string base; + std::string lang_tag; + + explicit BaseState(InheritedAttributeInfo const &i) + : base(i.base), lang_tag(i.lang_tag) { + } + + static constexpr std::string_view base_attribute = "http://www.w3.org/XML/1998/namespacebase"; + static constexpr std::string_view lang_attribute = "http://www.w3.org/XML/1998/namespacelang"; + static InheritedAttributeInfo get_inherited_attributes(ImplXML &impl, std::span attributes); + }; +} + +#endif //RDF4CPP_XMLPARSERBASESTATE_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp new file mode 100644 index 00000000..51d72a8e --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp @@ -0,0 +1,39 @@ +#include + +#include + +namespace rdf4cpp::parser { + void IStreamQuadIterator::ImplXML::CollectionState::on_characters(ImplXML &i, std::string_view const chars) { + if (!ImplXML::trim_left(chars).empty()) { + i.add_error(ParsingError::Type::BadSyntax, "expected element, found characters"); + } + } + + void IStreamQuadIterator::ImplXML::CollectionState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes) { + DescriptionState::enter(i, local_name, uri, attributes, [&](Node const obj) { + if (first) { + first = false; + last_bn = i.make_bn(std::nullopt); + i.add_statement(subject, predicate, last_bn, reify); 
+ } else { + auto const bn = i.make_bn(std::nullopt); + i.add_statement(last_bn, i.make_hardcoded_iri(iri_rest), bn, IRI::make_null()); + last_bn = bn; + } + i.add_statement(last_bn, i.make_hardcoded_iri(iri_first), obj, IRI::make_null()); + }); + } + + void IStreamQuadIterator::ImplXML::CollectionState::on_end_element(ImplXML &i) { + auto const nil = i.make_hardcoded_iri(iri_nil); + if (first) { + i.add_statement(subject, predicate, nil, reify); + } else { + i.add_statement(last_bn, i.make_hardcoded_iri(iri_rest), nil, IRI::make_null()); + } + i.pop_state(); + } + void IStreamQuadIterator::ImplXML::CollectionState::move_to(BaseState *b) noexcept { + new (b) CollectionState(std::move(*this)); + } +} \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp new file mode 100644 index 00000000..75d1510d --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp @@ -0,0 +1,32 @@ +#ifndef XMLPARSERCOLLECTIONSTATE_HPP +#define XMLPARSERCOLLECTIONSTATE_HPP + +#include + +namespace rdf4cpp::parser { + /** + * state for https://www.w3.org/TR/rdf11-xml/#parseTypeCollectionPropertyElt + */ + struct IStreamQuadIterator::ImplXMLStateCollector::CollectionState final : BaseState { + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + Node subject; + IRI predicate; + Node last_bn = Node::make_null(); + IRI reify; + bool first = true; + + CollectionState(InheritedAttributeInfo const &i, Node sub, IRI pred, IRI reify) + : BaseState(i), subject(sub), predicate(pred), reify(reify) { + } + + static constexpr std::string_view iri_nil = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil"; + static constexpr std::string_view iri_rest = 
"http://www.w3.org/1999/02/22-rdf-syntax-ns#rest"; + static constexpr std::string_view iri_first = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first"; + }; +} // namespace rdf4cpp::parser + +#endif // XMLPARSERCOLLECTIONSTATE_HPP \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp new file mode 100644 index 00000000..9d64dffa --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp @@ -0,0 +1,88 @@ +#include + +namespace rdf4cpp::parser { + void IStreamQuadIterator::ImplXML::DescriptionState::on_characters(ImplXML &i, std::string_view const chars) { + if (!ImplXML::trim_left(chars).empty()) { + i.add_error(ParsingError::Type::BadSyntax, "expected predicate, found characters"); + } + } + + void IStreamQuadIterator::ImplXML::DescriptionState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span attributes) { + auto const inherited_attribute_info = get_inherited_attributes(i, attributes); + IRI predicate; + if (ImplXML::iri_equal_pieces(PredicateState::list_start_element, uri, local_name)) { + predicate = i.make_iri(std::format("http://www.w3.org/1999/02/22-rdf-syntax-ns#_{}", list_current++), inherited_attribute_info.base); + } else { + predicate = i.make_iri(uri, local_name, inherited_attribute_info.base); + } + std::optional datatype = std::nullopt; + std::optional sub = std::nullopt; + IRI reify = IRI::make_null(); + bool parse_resource = false; + bool parse_literal = false; + bool parse_collection = false; + for (auto const &att : attributes) { + if (ImplXML::iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name())) { + datatype = i.make_iri(att.value(), inherited_attribute_info.base); + } else if (ImplXML::iri_equal_pieces(PredicateState::resource_attrib, att.uri(), att.local_name())) { + sub = i.make_iri(att.value(), 
inherited_attribute_info.base); + } else if (ImplXML::iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { + sub = i.make_bn(att.value()); + } else if (ImplXML::iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { + reify = i.make_id(att.value(), inherited_attribute_info.base); + } else if (ImplXML::iri_equal_pieces(PredicateState::parse_type_attrib, att.uri(), att.local_name())) { + if (att.value() == PredicateState::parse_type_resource) { + parse_resource = true; + } else if (att.value() == PredicateState::parse_type_collection) { + parse_collection = true; + } else { + parse_literal = true; + } + } + } + for (auto const &att : attributes) { + if (PredicateState::iri_reserved_predicate(att.uri(), att.local_name())) { + continue; + } + if (!sub.has_value()) { + sub = i.make_bn(std::nullopt); + } + if (ImplXML::iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { + IRI const obj = i.make_iri(att.value(), base); + i.add_statement(*sub, i.make_type_iri(), obj, IRI::make_null()); + } else { + IRI const pred = i.make_iri(att.uri(), att.local_name(), base); + Literal const obj = i.make_literal(att.value(), std::nullopt, inherited_attribute_info.lang_tag); + i.add_statement(*sub, pred, obj, IRI::make_null()); + } + } + if (sub.has_value() && (parse_collection || parse_literal || parse_resource)) { + i.add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:parseType, rdf:nodeID and rdf:resource"); + } + if (datatype.has_value()) { + i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify, *datatype); + } else if (sub.has_value()) { + i.add_statement(subject, predicate, *sub, reify); + i.state_stack_.emplace_back(std::in_place_type_t{}); + } else if (parse_resource) { + Node const obj = i.make_bn(std::nullopt); + i.add_statement(subject, predicate, obj, reify); + i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, obj); + } else if (parse_literal) { + auto 
&xml_state = i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); + static_cast(xml_state.get()).source_input(i); // NOLINT(*-pro-type-static-cast-downcast) + } else if (parse_collection) { + i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); + } else { + i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); + } + i.update_current_state(); + } + + void IStreamQuadIterator::ImplXML::DescriptionState::on_end_element(ImplXML &i) { + i.pop_state(); + } + void IStreamQuadIterator::ImplXML::DescriptionState::move_to(BaseState *b) noexcept { + new (b) DescriptionState(std::move(*this)); + } +} \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp new file mode 100644 index 00000000..04d8e6eb --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp @@ -0,0 +1,40 @@ +#ifndef RDF4CPP_XMLPARSERDESCRIPTIONSTATE_H +#define RDF4CPP_XMLPARSERDESCRIPTIONSTATE_H + +#include +#include + +namespace rdf4cpp::parser { + /** + * state for https://www.w3.org/TR/rdf11-xml/#nodeElementList and https://www.w3.org/TR/rdf11-xml/#nodeElement + * on_start_element checks and dispatches for the different options in https://www.w3.org/TR/rdf11-xml/#propertyEltList + * (https://www.w3.org/TR/rdf11-xml/#parseTypeResourcePropertyElt has no own state, instead gets handled directly by on_start_element) + */ + struct IStreamQuadIterator::ImplXMLStateCollector::DescriptionState final : BaseState { + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + Node subject; + size_t list_current 
= 1; + + explicit DescriptionState(InheritedAttributeInfo const &i, Node sub) + : BaseState(i), subject(sub) { + } + + /** + * include XMLParserDescriptionStateEnter.hpp to use it + */ + template + static void enter(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes, F f); + + static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description"; + static constexpr std::string_view about_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about"; + static constexpr std::string_view id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#ID"; + static constexpr std::string_view node_id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID"; + static constexpr std::string_view type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; + }; +} + +#endif //RDF4CPP_XMLPARSERDESCRIPTIONSTATE_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionStateEnter.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionStateEnter.hpp new file mode 100644 index 00000000..0636ea26 --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionStateEnter.hpp @@ -0,0 +1,59 @@ +#include + +namespace rdf4cpp::parser { + template + void IStreamQuadIterator::ImplXML::DescriptionState::enter(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { + auto const inherited_attribute_info = get_inherited_attributes(i, attributes); + Node sub = Node::make_null(); + auto check_only_one = [&sub, &i]() { + if (!sub.null()) { + i.add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:ID, rdf:about, and rdf:nodeID"); + return true; + } + return false; + }; + for (auto const &att : attributes) { + if (ImplXML::iri_equal_pieces(about_attrib, att.uri(), att.local_name())) { + if (check_only_one()) { + continue; + } + sub = i.make_iri(att.value(), inherited_attribute_info.base); + } else if (ImplXML::iri_equal_pieces(id_attrib, 
att.uri(), att.local_name())) { + if (check_only_one()) { + continue; + } + sub = i.make_id(att.value(), inherited_attribute_info.base); + } else if (ImplXML::iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { + if (check_only_one()) { + continue; + } + sub = i.make_bn(att.value()); + } + } + if (sub.null()) { + sub = i.make_bn(std::nullopt); + } + if (!ImplXML::iri_equal_pieces(start_element, uri, local_name)) { + IRI const obj = i.make_iri(uri, local_name, inherited_attribute_info.base); + if (!obj.null()) { + i.add_statement(sub, i.make_type_iri(), obj, IRI::make_null()); + } + } + for (auto const &att : attributes) { + if (PredicateState::iri_reserved_predicate(att.uri(), att.local_name())) { + continue; + } + if (ImplXML::iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { + IRI const obj = i.make_iri(att.value(), inherited_attribute_info.base); + i.add_statement(sub, i.make_type_iri(), obj, IRI::make_null()); + } else { + IRI const pred = i.make_iri(att.uri(), att.local_name(), inherited_attribute_info.base); + Literal const obj = i.make_literal(att.value(), std::nullopt, inherited_attribute_info.lang_tag); + i.add_statement(sub, pred, obj, IRI::make_null()); + } + } + f(sub); + i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, sub); + i.update_current_state(); + } +} \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp new file mode 100644 index 00000000..7b488fcd --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp @@ -0,0 +1,20 @@ +#include + +namespace rdf4cpp::parser { + void IStreamQuadIterator::ImplXML::EmptyElement::on_characters(ImplXML &i, std::string_view const chars) { + if (!ImplXML::trim_left(chars).empty()) { + i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found characters"); + } + } + + void 
IStreamQuadIterator::ImplXML::EmptyElement::on_start_element(ImplXML &i, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found ???"); + } + + void IStreamQuadIterator::ImplXML::EmptyElement::on_end_element(ImplXML &i) { + i.pop_state(); + } + void IStreamQuadIterator::ImplXML::EmptyElement::move_to(BaseState *b) noexcept { + new (b) EmptyElement(std::move(*this)); + } +} \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp new file mode 100644 index 00000000..cc47038f --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp @@ -0,0 +1,22 @@ +#ifndef XMLPARSEREMPTYELEMENT_HPP +#define XMLPARSEREMPTYELEMENT_HPP + +#include + +namespace rdf4cpp::parser { + /** + * state for https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (if attributes are present) + */ + struct IStreamQuadIterator::ImplXMLStateCollector::EmptyElement final : BaseState { + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + EmptyElement() + : BaseState({}) { + } + }; +} // namespace rdf4cpp::parser + +#endif // XMLPARSEREMPTYELEMENT_HPP \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp new file mode 100644 index 00000000..29b35271 --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp @@ -0,0 +1,25 @@ +#include + +namespace rdf4cpp::parser { + void IStreamQuadIterator::ImplXML::InitialState::on_characters(ImplXML &i, std::string_view const chars) { + if 
(!ImplXML::trim_left(chars).empty()) { + i.add_error(ParsingError::Type::BadSyntax, "expected RDF, found characters"); + } + } + + void IStreamQuadIterator::ImplXML::InitialState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes) { + if (ImplXML::iri_equal_pieces(RDFState::start_element, uri, local_name)) { + i.state_stack_.emplace_back(std::in_place_type_t{}, get_inherited_attributes(i, attributes)); + i.update_current_state(); + return; + } + i.add_error(ParsingError::Type::BadSyntax, "expected RDF, found ???"); + } + + void IStreamQuadIterator::ImplXML::InitialState::on_end_element(ImplXML &i) { + i.add_error(ParsingError::Type::BadSyntax, "expected RDF, found end of ???"); + } + void IStreamQuadIterator::ImplXML::InitialState::move_to(BaseState *b) noexcept { + new (b) InitialState(std::move(*this)); + } +} \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp new file mode 100644 index 00000000..ab341f60 --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp @@ -0,0 +1,23 @@ +#ifndef RDF4CPP_XMLPARSERINITIALSTATE_H +#define RDF4CPP_XMLPARSERINITIALSTATE_H + +#include +#include + +namespace rdf4cpp::parser { + /** + * initial state, checks for start of https://www.w3.org/TR/rdf11-xml/#RDF + */ + struct IStreamQuadIterator::ImplXMLStateCollector::InitialState final : BaseState { + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + InitialState() + : BaseState({}) { + } + }; +} + +#endif //RDF4CPP_XMLPARSERINITIALSTATE_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp 
b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp new file mode 100644 index 00000000..5d656bca --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp @@ -0,0 +1,47 @@ +#include + +#include + +namespace rdf4cpp::parser { + + void IStreamQuadIterator::ImplXML::PredicateState::on_characters([[maybe_unused]] ImplXML &i, std::string_view const chars) { + if (done) { + if (!ImplXML::trim_left(chars).empty()) { + i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found literal"); + } + return; + } + literal.append(chars); + } + + void IStreamQuadIterator::ImplXML::PredicateState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes) { + if (!ImplXML::trim_left(literal).empty()) { + i.add_error(ParsingError::Type::BadSyntax, "expected end of element or literal, found element"); + return; + } + if (done) { + i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found element"); + return; + } + DescriptionState::enter(i, local_name, uri, attributes, [&](Node obj) { + done = true; + i.add_statement(subject, predicate, obj, reify); + }); + } + + void IStreamQuadIterator::ImplXML::PredicateState::on_end_element(ImplXML &i) { + if (!done) { + Literal const lit = i.make_literal(literal, std::nullopt, std::nullopt); + i.add_statement(subject, predicate, lit, reify); + } + i.pop_state(); + } + void IStreamQuadIterator::ImplXML::PredicateState::move_to(BaseState *b) noexcept { + new (b) PredicateState(std::move(*this)); + } + + bool IStreamQuadIterator::ImplXML::PredicateState::iri_reserved_predicate(std::string_view const uri, std::string_view const local_name) { + return ImplXML::iri_reserved(uri, local_name) || ImplXML::iri_equal_pieces(DescriptionState::start_element, uri, local_name) || ImplXML::iri_equal_pieces(list_start_element, uri, local_name); + } + +} \ No newline at end of file diff --git 
a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp new file mode 100644 index 00000000..20afa6e5 --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp @@ -0,0 +1,40 @@ +#ifndef RDF4CPP_XMLPARSERPREDICATESTATE_H +#define RDF4CPP_XMLPARSERPREDICATESTATE_H + +#include +#include + +namespace rdf4cpp::parser { + /** + * state for https://www.w3.org/TR/rdf11-xml/#resourcePropertyElt (nested nodeElement / DescriptionState + * and https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (literal with no datatype attribute) + * and https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (with no attributes (empty literal)) + */ + struct IStreamQuadIterator::ImplXMLStateCollector::PredicateState : BaseState { + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + Node subject; + IRI predicate; + IRI reify; + std::string literal; + bool done = false; + + PredicateState(InheritedAttributeInfo const &i, Node sub, IRI predicate, IRI reify) + : BaseState(i), subject(sub), predicate(predicate), reify(reify) { + } + + static constexpr std::string_view resource_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#resource"; + static constexpr std::string_view parse_type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#parseType"; + static constexpr std::string_view parse_type_resource = "Resource"; + static constexpr std::string_view parse_type_literal = "Literal"; + static constexpr std::string_view parse_type_collection = "Collection"; + static constexpr std::string_view list_start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#li"; + + static bool iri_reserved_predicate(std::string_view uri, std::string_view local_name); + }; +} + +#endif 
//RDF4CPP_XMLPARSERPREDICATESTATE_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp new file mode 100644 index 00000000..03ef47df --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp @@ -0,0 +1,23 @@ +#include + +#include + +namespace rdf4cpp::parser { + void IStreamQuadIterator::ImplXML::RDFState::on_characters(ImplXML &i, std::string_view const chars) { + if (!ImplXML::trim_left(chars).empty()) { + i.add_error(ParsingError::Type::BadSyntax, "expected Description, found characters"); + } + } + + void IStreamQuadIterator::ImplXML::RDFState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes) { + DescriptionState::enter(i, local_name, uri, attributes, [](auto) { + }); + } + + void IStreamQuadIterator::ImplXML::RDFState::on_end_element(ImplXML &i) { + i.pop_state(); + } + void IStreamQuadIterator::ImplXML::RDFState::move_to(BaseState *b) noexcept { + new (b) RDFState(std::move(*this)); + } +} // namespace rdf4cpp::parser \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp new file mode 100644 index 00000000..34d68fa7 --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp @@ -0,0 +1,23 @@ +#ifndef RDF4CPP_XMLPARSERRDFSTATE_H +#define RDF4CPP_XMLPARSERRDFSTATE_H + +#include +#include + +namespace rdf4cpp::parser { + /** + * state for https://www.w3.org/TR/rdf11-xml/#RDF + */ + struct IStreamQuadIterator::ImplXMLStateCollector::RDFState final : BaseState { + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + static constexpr std::string_view start_element = 
"http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF"; + + using BaseState::BaseState; + }; +} + +#endif //RDF4CPP_XMLPARSERRDFSTATE_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp new file mode 100644 index 00000000..c0380903 --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp @@ -0,0 +1,18 @@ +#include + +namespace rdf4cpp::parser { + void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::on_start_element(ImplXML &i, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + i.add_error(ParsingError::Type::BadSyntax, "expected literal, found element"); + } + + void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::on_end_element(ImplXML &i) { + if (!datatype.null()) { + Literal const lit = i.make_literal(literal, datatype, std::nullopt); + i.add_statement(subject, predicate, lit, reify); + } + i.pop_state(); + } + void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::move_to(BaseState *b) noexcept { + new (b) TypedLiteralPredicateState(std::move(*this)); + } +} \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp new file mode 100644 index 00000000..e002dcc8 --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp @@ -0,0 +1,26 @@ +#ifndef RDF4CPP_XMLPARSERTYPEDLITERALPREDICATESTATE_H +#define RDF4CPP_XMLPARSERTYPEDLITERALPREDICATESTATE_H + +#include +#include + +namespace rdf4cpp::parser { + /** + * state for https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (with datatype attribute) + */ + struct IStreamQuadIterator::ImplXMLStateCollector::TypedLiteralPredicateState final : PredicateState { + void on_start_element(ImplXML &i, std::string_view 
local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + IRI datatype; + + TypedLiteralPredicateState(InheritedAttributeInfo const &i, Node iri, IRI predicate, IRI reify, IRI datatype) + : PredicateState(i, iri, predicate, reify), datatype(datatype) { + } + + static constexpr std::string_view datatype_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype"; + }; +} + +#endif //RDF4CPP_XMLPARSERTYPEDLITERALPREDICATESTATE_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp new file mode 100644 index 00000000..6715a39a --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp @@ -0,0 +1,52 @@ +#include + +namespace rdf4cpp::parser { + void IStreamQuadIterator::ImplXML::XMLLiteralState::on_characters(ImplXML &i, [[maybe_unused]] std::string_view chars) { + source_input(i); + } + + void IStreamQuadIterator::ImplXML::XMLLiteralState::on_start_element(ImplXML &i, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + ++depth; + source_input(i); + } + + void IStreamQuadIterator::ImplXML::XMLLiteralState::on_end_element(ImplXML &i) { + if (depth > 0) { + --depth; + source_input(i); + return; + } + IRI datatype = i.make_hardcoded_iri("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"); + std::string_view l = literal; + l = l.substr(0, last_offset); + l.remove_prefix(data_start); + if (!l.empty() && l[0] == '/') { + l.remove_prefix(1); + } + if (!l.empty() && l[0] == '>') { + l.remove_prefix(1); + } + Literal const lit = i.make_literal(l, datatype, std::nullopt); + i.add_statement(subject, predicate, lit, reify); + i.pop_state(); + } + void IStreamQuadIterator::ImplXML::XMLLiteralState::move_to(BaseState *b) noexcept { + new (b) XMLLiteralState(std::move(*this)); + } + + void 
IStreamQuadIterator::ImplXML::XMLLiteralState::source_input(ImplXML &i) { + xmlChar const *data; + int size = 1024; + int off = 0; + xmlCtxtGetInputWindow(i.context_.get(), 0, &data, &size, &off); + std::string_view const sv{reinterpret_cast(data), static_cast(size)}; + if (literal.empty()) { + data_start = off; + } + if (!static_cast(literal).ends_with(sv)) { + last_size = literal.size(); + literal += sv; + } + last_offset = static_cast(off) + last_size; + } +} \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp new file mode 100644 index 00000000..8efd6e19 --- /dev/null +++ b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp @@ -0,0 +1,29 @@ +#ifndef RDF4CPP_XMLPARSERXMLLITERALESTATE_H +#define RDF4CPP_XMLPARSERXMLLITERALESTATE_H + +#include +#include + +namespace rdf4cpp::parser { + /** + * state for https://www.w3.org/TR/rdf11-xml/#parseTypeLiteralPropertyElt + * (and https://www.w3.org/TR/rdf11-xml/#parseTypeOtherPropertyElt) + */ + struct IStreamQuadIterator::ImplXMLStateCollector::XMLLiteralState final : PredicateState { + void on_characters(ImplXML &i, std::string_view chars) override; + void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; + void on_end_element(ImplXML &i) override; + void move_to(BaseState *b) noexcept override; + + size_t depth = 0; + size_t data_start = 0; + size_t last_offset = 0; + size_t last_size = 0; + + using PredicateState::PredicateState; + + void source_input(ImplXML &i); + }; +} + +#endif //RDF4CPP_XMLPARSERXMLLITERALESTATE_H diff --git a/src/rdf4cpp/parser/IStreamQuadIterator.hpp b/src/rdf4cpp/parser/IStreamQuadIterator.hpp index 9ab9092d..d1f9c13d 100644 --- a/src/rdf4cpp/parser/IStreamQuadIterator.hpp +++ b/src/rdf4cpp/parser/IStreamQuadIterator.hpp @@ -102,6 +102,7 @@ struct IStreamQuadIterator { Impl &operator=(Impl &&) = delete; }; 
struct ImplSerd; + struct ImplXMLStateCollector; struct ImplXML; std::unique_ptr impl; From 5ca0c2f4a9be3472e978a71a60110ee3054b0595 Mon Sep 17 00:00:00 2001 From: mcb Date: Wed, 3 Dec 2025 14:50:32 +0100 Subject: [PATCH 31/42] fix gcc14 bug again --- private/rdf4cpp/parser/XMLParser.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/private/rdf4cpp/parser/XMLParser.hpp b/private/rdf4cpp/parser/XMLParser.hpp index f4670ac5..e25e5bb5 100644 --- a/private/rdf4cpp/parser/XMLParser.hpp +++ b/private/rdf4cpp/parser/XMLParser.hpp @@ -36,10 +36,14 @@ namespace rdf4cpp::parser { struct IStreamQuadIterator::ImplXML final : ImplXMLStateCollector { private: xmlSAXHandler handler_; - std::unique_ptr - context_; + // workaround for gcc-14 bug, erroneously warns on unsing a lambda here + // see https://github.com/NVIDIA/stdexec/issues/1143 + struct XmlParserCtxtDtorLambda { + void operator()(xmlParserCtxt* c) const { + xmlFreeParserCtxt(c); + } + }; + std::unique_ptr context_; void *reader_obj_; ReadFunc read_func_; ErrorFunc error_func_; From 6cf39d5d7a5d8e1c61b6eec8bfe6933686143803 Mon Sep 17 00:00:00 2001 From: mcb Date: Thu, 4 Dec 2025 17:24:36 +0100 Subject: [PATCH 32/42] remove iconv --- conanfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 4ea161ab..a4891bb4 100644 --- a/conanfile.py +++ b/conanfile.py @@ -39,7 +39,7 @@ def requirements(self): self.requires("dice-hash/0.4.11", transitive_headers=True) self.requires("dice-sparse-map/0.2.9", transitive_headers=True) self.requires("dice-template-library/1.19.0", transitive_headers=True) - self.requires("libxml2/2.15.0") + self.requires("libxml2/2.15.0", options={"iconv": False}) if self.options.with_test_deps: self.test_requires("doctest/2.4.11") From f8f6454cbeb7f33879c40a1e37ce02f4a8f209e1 Mon Sep 17 00:00:00 2001 From: mcb Date: Tue, 9 Dec 2025 17:19:19 +0100 Subject: [PATCH 33/42] states no longer get a Impl& --- CMakeLists.txt | 1 
+ private/rdf4cpp/parser/XMLParser.cpp | 228 +++++------------- private/rdf4cpp/parser/XMLParser.hpp | 95 +++----- .../parser/XMLParserStateCollector.cpp | 167 +++++++++++++ .../parser/XMLParserStateCollector.hpp | 83 ++++++- .../parser/XMLStates/XMLParserBaseState.cpp | 10 +- .../parser/XMLStates/XMLParserBaseState.hpp | 10 +- .../XMLStates/XMLParserCollectionState.cpp | 49 ++-- .../XMLStates/XMLParserCollectionState.hpp | 6 +- .../XMLStates/XMLParserDescriptionState.cpp | 134 +++++++--- .../XMLStates/XMLParserDescriptionState.hpp | 12 +- .../XMLParserDescriptionStateEnter.hpp | 59 ----- .../XMLStates/XMLParserEmptyElement.cpp | 20 +- .../XMLStates/XMLParserEmptyElement.hpp | 6 +- .../XMLStates/XMLParserInitialState.cpp | 30 ++- .../XMLStates/XMLParserInitialState.hpp | 6 +- .../XMLStates/XMLParserPredicateState.cpp | 47 ++-- .../XMLStates/XMLParserPredicateState.hpp | 6 +- .../parser/XMLStates/XMLParserRDFState.cpp | 21 +- .../parser/XMLStates/XMLParserRDFState.hpp | 6 +- .../XMLParserTypedLiteralPredicateState.cpp | 17 +- .../XMLParserTypedLiteralPredicateState.hpp | 4 +- .../XMLStates/XMLParserXMLLiteralState.cpp | 37 ++- .../XMLStates/XMLParserXMLLiteralState.hpp | 12 +- 24 files changed, 581 insertions(+), 485 deletions(-) create mode 100644 private/rdf4cpp/parser/XMLParserStateCollector.cpp delete mode 100644 private/rdf4cpp/parser/XMLStates/XMLParserDescriptionStateEnter.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e6d25e5..851d4e78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,6 +151,7 @@ add_library(rdf4cpp src/rdf4cpp/util/Anonymizer.cpp private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp private/rdf4cpp/parser/XMLParser.cpp + private/rdf4cpp/parser/XMLParserStateCollector.cpp private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp diff --git a/private/rdf4cpp/parser/XMLParser.cpp 
b/private/rdf4cpp/parser/XMLParser.cpp index c42e4edf..1f02b9ec 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -15,41 +15,35 @@ namespace rdf4cpp::parser { }; r.characters = [](void *th, xmlChar const *e, int const len) { auto *t = static_cast(th); - t->current_state_->on_characters(*t, from_xml_char(e, len)); + t->handle_state_transition(t->current_state_->on_characters(t->output_, from_xml_char(e, len), t->make_info())); }; r.startElementNs = [](void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri, [[maybe_unused]] int n_namespaces, [[maybe_unused]] xmlChar const **namespaces, int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) { auto *t = static_cast(th); - t->current_state_->on_start_element(*t, from_xml_char(local_name), from_xml_char(uri), - std::span{reinterpret_cast(attributes), static_cast(n_attributes)}); + t->handle_state_transition(t->current_state_->on_start_element(t->output_, from_xml_char(local_name), from_xml_char(uri), + std::span{reinterpret_cast(attributes), static_cast(n_attributes)}, t->make_info())); }; r.endElementNs = [](void *th, [[maybe_unused]] xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, [[maybe_unused]] xmlChar const *uri) { auto *t = static_cast(th); - t->current_state_->on_end_element(*t); + t->handle_state_transition(t->current_state_->on_end_element(t->output_, t->make_info())); }; r.warning = on_error; r.error = on_error; return r; } - - void IStreamQuadIterator::ImplXML::add_error(ParsingError::Type const ty, std::string msg) { - uint64_t const lin = xmlSAX2GetLineNumber(context_.get()); - uint64_t const col = xmlSAX2GetColumnNumber(context_.get()); - result_queue_.emplace_back(nonstd::unexpect, ty, lin, col, std::move(msg)); - } - - void IStreamQuadIterator::ImplXML::add_statement(Node const subject, IRI const predicate, Node const object, IRI const reify) { - if (subject.null() || 
predicate.null() || object.null()) { - return; - } - result_queue_.emplace_back(Quad(subject, predicate, object)); - if (!reify.null()) { - result_queue_.emplace_back(Quad(reify, make_hardcoded_iri(reify_subject), subject)); - result_queue_.emplace_back(Quad(reify, make_hardcoded_iri(reify_predicate), predicate)); - result_queue_.emplace_back(Quad(reify, make_hardcoded_iri(reify_object), object)); - result_queue_.emplace_back(Quad(reify, make_type_iri(), make_hardcoded_iri(reify_type))); - } + void IStreamQuadIterator::ImplXML::handle_state_transition(StateTransition transition) { + std::visit([&](T &&s) { + if constexpr (std::same_as) { + return; + } else if constexpr (std::same_as) { + pop_state(); + } else { + state_stack_.emplace_back(std::in_place_type_t{}, std::forward(s)); + update_current_state(); + } + }, + std::move(transition.modify_state)); } void IStreamQuadIterator::ImplXML::update_current_state() { @@ -66,24 +60,8 @@ namespace rdf4cpp::parser { update_current_state(); } - std::string_view IStreamQuadIterator::ImplXML::trim_left(std::string_view v) { - auto s = v.find_first_not_of(" \t\r\n"); - if (s == std::string_view::npos) { - return ""; - } - v.remove_prefix(s); - // ReSharper disable once CppDFALocalValueEscapesFunction - return v; - } - - bool IStreamQuadIterator::ImplXML::iri_equal_pieces(std::string_view const full_iri, std::string_view const uri, std::string_view const local_name) { - if (full_iri.size() != local_name.size() + uri.size()) { - return false; - } - return full_iri.starts_with(uri) && full_iri.ends_with(local_name); - } - - bool IStreamQuadIterator::ImplXML::iri_reserved(std::string_view uri, std::string_view local_name) { + // implemented here, to have access to states + bool IStreamQuadIterator::ImplXMLStateCollector::iri_reserved(std::string_view const uri, std::string_view const local_name) { static constexpr std::array reserved = { RDFState::start_element, DescriptionState::id_attrib, @@ -103,116 +81,6 @@ namespace 
rdf4cpp::parser { }); } - template - NT IStreamQuadIterator::ImplXML::inspect_node(NT node) { - try { - state_->inspect_node_func(node); - return node; - } catch (std::exception &e) { - add_error(ParsingError::Type::BadSyntax, std::format("Triple explicitly skipped by inspect function: {}", e.what())); - } catch (...) { - add_error(ParsingError::Type::BadSyntax, "Triple explicitly skipped by inspect function"); - } - return NT::make_null(); - } - - IRI IStreamQuadIterator::ImplXML::make_iri(std::string_view const iri, std::string_view const base) { - if (base.empty()) { - for (auto const &s : state_stack_ | std::ranges::views::reverse) { - std::string_view const v = s.get().base; - if (!v.empty()) { - state_->iri_factory.set_base_unchecked(v); - break; - } - } - } else { - state_->iri_factory.set_base_unchecked(base); - } - auto exp = state_->iri_factory.from_maybe_relative(iri, state_->node_storage); - if (exp.has_value()) { - return inspect_node(*exp); - } else { - add_error(ParsingError::Type::BadIri, std::format("{}: {}", iri, exp.error())); - return IRI::make_null(); - } - } - - IRI IStreamQuadIterator::ImplXML::make_iri(std::string_view const uri, std::string_view const local_name, std::string_view const base) { - std::string iri{uri}; - iri.append(local_name); - return make_iri(iri, base); - } - - IRI IStreamQuadIterator::ImplXML::make_id(std::string_view const local_name, std::string_view const base) { - std::string local = "#"; - local.append(local_name); - auto iri = make_iri(local, base); - if (reserved_ids_.contains(iri.backend_handle().id())) { - add_error(ParsingError::Type::BadIri, std::format("{}: is already used as a rdf:ID", iri)); - return IRI::make_null(); - } - reserved_ids_.insert(iri.backend_handle().id()); - return iri; - } - - IRI IStreamQuadIterator::ImplXML::make_hardcoded_iri(std::string_view const iri) const { - return IRI::make_unchecked(iri, state_->node_storage); - } - - IRI IStreamQuadIterator::ImplXML::make_type_iri() const { - 
return IRI::rdf_type(state_->node_storage); - } - - Node IStreamQuadIterator::ImplXML::make_bn(std::optional name) { - std::string n = ""; - if (!name.has_value()) { - n = std::format("bn_{}", next_bn_index_++); - name = n; - } - try { - if (state_->blank_node_scope_manager == nullptr) { - return inspect_node(BlankNode::make(*name, state_->node_storage)); - } else { - return inspect_node(state_->blank_node_scope_manager.scope("").get_or_generate_node(*name, state_->node_storage)); - } - } catch (InvalidNode const &e) { - add_error(ParsingError::Type::BadBlankNode, e.what()); - return BlankNode::make_null(); - } catch (...) { - add_error(ParsingError::Type::BadBlankNode, "unknown error"); - return BlankNode::make_null(); - } - } - - Literal IStreamQuadIterator::ImplXML::make_literal(std::string_view value, std::optional datatype, std::optional lang_tag) { - Literal l = Literal::make_null(); - try { - if (datatype.has_value()) { - l = Literal::make_typed(value, *datatype, state_->node_storage); - } else { - if (!lang_tag.has_value() || lang_tag->empty()) { - for (auto const &s : state_stack_ | std::ranges::views::reverse) { - std::string_view const v = s.get().lang_tag; - if (!v.empty()) { - lang_tag = v; - break; - } - } - } - if (lang_tag.has_value() && !lang_tag->empty()) { - l = Literal::make_lang_tagged(value, *lang_tag, state_->node_storage); - } else { - l = Literal::make_simple(value, state_->node_storage); - } - } - } catch (InvalidNode const &e) { - add_error(ParsingError::Type::BadLiteral, e.what()); - } catch (...) { - add_error(ParsingError::Type::BadLiteral, "unknown error"); - } - return inspect_node(l); - } - void IStreamQuadIterator::ImplXML::on_error(void *th, char const *msg, ...) 
{ // NOLINT(*-dcl50-cpp) va_list args; auto t = static_cast(th); @@ -225,45 +93,67 @@ namespace rdf4cpp::parser { } else { out = "unknown error, too long to fit"; } - t->add_error(ParsingError::Type::BadSyntax, std::move(out)); + t->output_.add_error(ParsingError::Type::BadSyntax, std::move(out), t->make_info()); va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay) } + IStreamQuadIterator::ImplXMLStateCollector::Info IStreamQuadIterator::ImplXML::make_info() const { + std::string_view base = ""; + for (auto const &s : state_stack_ | std::ranges::views::reverse) { + std::string_view const v = s.get().base; + if (!v.empty()) { + base = v; + break; + } + } + + std::string_view lang_tag = ""; + for (auto const &s : state_stack_ | std::ranges::views::reverse) { + std::string_view const v = s.get().lang_tag; + if (!v.empty()) { + lang_tag = v; + break; + } + } + + xmlChar const *data; + int size = 1024; + int off = 0; + xmlCtxtGetInputWindow(context_.get(), 0, &data, &size, &off); + std::string_view const source{reinterpret_cast(data), static_cast(size)}; + + return Info{ + static_cast(xmlSAX2GetLineNumber(context_.get())), + static_cast(xmlSAX2GetColumnNumber(context_.get())), + base, + lang_tag, + source, + off, + }; + } + IStreamQuadIterator::ImplXML::ImplXML(void *obj, ReadFunc const read, ErrorFunc const err, EOFFunc const eof, state_type *state) : handler_(make_sax_handler()), context_(xmlCreatePushParserCtxt(&handler_, this, nullptr, 0, "rdf/xml")), reader_obj_(obj), read_func_(read), error_func_(err), eof_func_(eof), - state_(state) { + output_(state) { xmlCtxtSetOptions(context_.get(), XML_PARSE_NOENT | XML_PARSE_PEDANTIC | XML_PARSE_NOCDATA | XML_PARSE_NO_XXE | XML_PARSE_BIG_LINES); state_stack_.reserve(10); state_stack_.emplace_back(std::in_place_type_t{}); update_current_state(); - if (state_ == nullptr) { - state_ = new state_type(); - state_is_owned_ = true; - } - - current_state_->base = state_->iri_factory.get_base(); + current_state_->base = 
output_.current_base_iri(); } - IStreamQuadIterator::ImplXML::~ImplXML() { - if (state_is_owned_) { - delete state_; - } + IStreamQuadIterator::ImplXML::~ImplXML() { // NOLINT(*-use-equals-default) } std::optional IStreamQuadIterator::ImplXML::next() { std::array buffer; // NOLINT(*-pro-type-member-init) - while (result_queue_.empty() && error_func_(reader_obj_) == 0 && eof_func_(reader_obj_) == 0) { + while (output_.empty() && error_func_(reader_obj_) == 0 && eof_func_(reader_obj_) == 0) { auto const read = read_func_(buffer.data(), sizeof(char), buffer.size(), reader_obj_); xmlParseChunk(context_.get(), buffer.data(), static_cast(read), eof_func_(reader_obj_) != 0); } - if (result_queue_.empty()) { - return std::nullopt; - } - auto r = result_queue_.front(); - result_queue_.pop_front(); - return r; + return output_.next(); } uint64_t IStreamQuadIterator::ImplXML::current_line() const noexcept { diff --git a/private/rdf4cpp/parser/XMLParser.hpp b/private/rdf4cpp/parser/XMLParser.hpp index e25e5bb5..9eb7e476 100644 --- a/private/rdf4cpp/parser/XMLParser.hpp +++ b/private/rdf4cpp/parser/XMLParser.hpp @@ -1,8 +1,6 @@ #ifndef RDF4CPP_XMLPARSER_H #define RDF4CPP_XMLPARSER_H -#include -#include #include #include @@ -11,8 +9,6 @@ #include #include -#include -#include #include @@ -22,15 +18,23 @@ #include +#include + #include + +#include + +#include + #include + +#include + #include -#include + #include -#include + #include -#include -#include namespace rdf4cpp::parser { struct IStreamQuadIterator::ImplXML final : ImplXMLStateCollector { @@ -39,7 +43,7 @@ namespace rdf4cpp::parser { // workaround for gcc-14 bug, erroneously warns on unsing a lambda here // see https://github.com/NVIDIA/stdexec/issues/1143 struct XmlParserCtxtDtorLambda { - void operator()(xmlParserCtxt* c) const { + void operator()(xmlParserCtxt *c) const { xmlFreeParserCtxt(c); } }; @@ -48,71 +52,21 @@ namespace rdf4cpp::parser { ReadFunc read_func_; ErrorFunc error_func_; EOFFunc eof_func_; - std::deque 
result_queue_; - size_t next_bn_index_ = 0; - state_type *state_; - bool state_is_owned_ = false; - dice::sparse_map::sparse_set reserved_ids_; - - static constexpr std::string_view reify_subject = "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject"; - static constexpr std::string_view reify_predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate"; - static constexpr std::string_view reify_object = "http://www.w3.org/1999/02/22-rdf-syntax-ns#object"; - static constexpr std::string_view reify_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement"; - - friend struct BaseState; - friend struct InitialState; - friend struct RDFState; - friend struct DescriptionState; - friend struct PredicateState; - friend struct TypedLiteralPredicateState; - friend struct EmptyElement; - friend struct XMLLiteralState; - friend struct CollectionState; + XMLOutputQueue output_; BaseState *current_state_ = nullptr; std::vector> state_stack_; static xmlSAXHandler make_sax_handler(); - void add_error(ParsingError::Type ty, std::string msg); - /** - * add statement to the output list, if none of the components is null - * (null is used to track an already inserted parse error for that component) - */ - void add_statement(Node subject, IRI predicate, Node object, IRI reify); + void handle_state_transition(StateTransition transition); void update_current_state(); void pop_state(); - /** - * removes whitespace according to xml spec - */ - [[nodiscard]] static std::string_view trim_left(std::string_view v); - [[nodiscard]] static bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); - [[nodiscard]] static bool iri_reserved(std::string_view uri, std::string_view local_name); - template - [[nodiscard]] NT inspect_node(NT node); - [[nodiscard]] IRI make_iri(std::string_view iri, std::string_view base); - [[nodiscard]] IRI make_iri(std::string_view uri, std::string_view local_name, std::string_view base); - /** - * create the IRI for 
an id_attrib, including uniqueness check - */ - [[nodiscard]] IRI make_id(std::string_view local_name, std::string_view base); - /** - * create an IRI with no checks, intended for hardcoded IRIs like reify_subject - */ - [[nodiscard]] IRI make_hardcoded_iri(std::string_view iri) const; - [[nodiscard]] IRI make_type_iri() const; - [[nodiscard]] Node make_bn(std::optional name); - /** - * creates a literal - * @param value - * @param datatype - * @param lang_tag (ignored, if datatype is set) - * @return - */ - [[nodiscard]] Literal make_literal(std::string_view value, std::optional datatype, std::optional lang_tag); static void on_error(void *th, char const *msg, ...); + [[nodiscard]] Info make_info() const; + public: ImplXML(void *obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type *state); ~ImplXML() override; @@ -127,6 +81,19 @@ namespace rdf4cpp::parser { [[nodiscard]] uint64_t current_line() const noexcept override; [[nodiscard]] uint64_t current_column() const noexcept override; }; -} + + struct IStreamQuadIterator::ImplXMLStateCollector::StateTransition { + using ModifyStateStack = std::variant; + + ModifyStateStack modify_state; + + template + explicit StateTransition(T &&...a) : modify_state(std::forward(a)...) 
{ + } + + StateTransition() : StateTransition(std::in_place_type_t{}) { + } + }; +} // namespace rdf4cpp::parser #endif //RDF4CPP_XMLPARSER_H diff --git a/private/rdf4cpp/parser/XMLParserStateCollector.cpp b/private/rdf4cpp/parser/XMLParserStateCollector.cpp new file mode 100644 index 00000000..693cec2b --- /dev/null +++ b/private/rdf4cpp/parser/XMLParserStateCollector.cpp @@ -0,0 +1,167 @@ +#include + + +namespace rdf4cpp::parser { + IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::XMLOutputQueue(state_type *state) : state_(state) { + if (state_ == nullptr) { + state_ = new state_type(); + state_is_owned_ = true; + } + } + + IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::~XMLOutputQueue() { + if (state_is_owned_) { + delete state_; + } + } + + bool IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::empty() const { + return result_queue_.empty(); + } + + std::optional IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::next() { + if (result_queue_.empty()) { + return std::nullopt; + } + auto r = result_queue_.front(); + result_queue_.pop_front(); + return r; + } + + std::string_view IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::current_base_iri() const { + return state_->iri_factory.get_base(); + } + + void IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::add_error(ParsingError::Type ty, std::string msg, Info const &i) { + result_queue_.emplace_back(nonstd::unexpect, ty, i.line, i.column, std::move(msg)); + } + + void IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::add_statement(Node subject, IRI predicate, Node object, IRI reify) { + if (subject.null() || predicate.null() || object.null()) { + return; + } + result_queue_.emplace_back(Quad(subject, predicate, object)); + if (!reify.null()) { + result_queue_.emplace_back(Quad(reify, make_hardcoded_iri(reify_subject), subject)); + result_queue_.emplace_back(Quad(reify, make_hardcoded_iri(reify_predicate), predicate)); + 
result_queue_.emplace_back(Quad(reify, make_hardcoded_iri(reify_object), object)); + result_queue_.emplace_back(Quad(reify, make_type_iri(), make_hardcoded_iri(reify_type))); + } + } + + IRI IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::make_hardcoded_iri(std::string_view const iri) const { + return IRI::make_unchecked(iri, state_->node_storage); + } + + IRI IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::make_type_iri() const { + return IRI::rdf_type(state_->node_storage); + } + + template + NT IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::inspect_node(NT node, Info const &i) { + try { + state_->inspect_node_func(node); + return node; + } catch (std::exception &e) { + add_error(ParsingError::Type::BadSyntax, std::format("Triple explicitly skipped by inspect function: {}", e.what()), i); + } catch (...) { + add_error(ParsingError::Type::BadSyntax, "Triple explicitly skipped by inspect function", i); + } + return NT::make_null(); + } + + IRI IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::make_iri(std::string_view const iri, std::string_view const base, Info const &i) { + if (base.empty()) { + state_->iri_factory.set_base_unchecked(i.base); + } else { + state_->iri_factory.set_base_unchecked(base); + } + auto exp = state_->iri_factory.from_maybe_relative(iri, state_->node_storage); + if (exp.has_value()) { + return inspect_node(*exp, i); + } else { + add_error(ParsingError::Type::BadIri, std::format("{}: {}", iri, exp.error()), i); + return IRI::make_null(); + } + } + + IRI IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::make_iri(std::string_view const uri, std::string_view const local_name, std::string_view const base, Info const &i) { + std::string iri{uri}; + iri.append(local_name); + return make_iri(iri, base, i); + } + + IRI IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::make_id(std::string_view const local_name, std::string_view const base, Info const &i) { + std::string local = "#"; 
+ local.append(local_name); + auto iri = make_iri(local, base, i); + if (reserved_ids_.contains(iri.backend_handle().id())) { + add_error(ParsingError::Type::BadIri, std::format("{}: is already used as a rdf:ID", iri), i); + return IRI::make_null(); + } + reserved_ids_.insert(iri.backend_handle().id()); + return iri; + } + + Node IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::make_bn(std::optional name, Info const &i) { + std::string n = ""; + if (!name.has_value()) { + n = std::format("bn_{}", next_bn_index_++); + name = n; + } + try { + if (state_->blank_node_scope_manager == nullptr) { + return inspect_node(BlankNode::make(*name, state_->node_storage), i); + } else { + return inspect_node(state_->blank_node_scope_manager.scope("").get_or_generate_node(*name, state_->node_storage), i); + } + } catch (InvalidNode const &e) { + add_error(ParsingError::Type::BadBlankNode, e.what(), i); + return BlankNode::make_null(); + } catch (...) { + add_error(ParsingError::Type::BadBlankNode, "unknown error", i); + return BlankNode::make_null(); + } + } + + Literal IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::make_literal(std::string_view value, std::optional datatype, std::optional lang_tag, Info const &i) { + Literal l = Literal::make_null(); + try { + if (datatype.has_value()) { + l = Literal::make_typed(value, *datatype, state_->node_storage); + } else { + if (!lang_tag.has_value() || lang_tag->empty()) { + lang_tag = i.lang_tag; + } + if (lang_tag.has_value() && !lang_tag->empty()) { + l = Literal::make_lang_tagged(value, *lang_tag, state_->node_storage); + } else { + l = Literal::make_simple(value, state_->node_storage); + } + } + } catch (InvalidNode const &e) { + add_error(ParsingError::Type::BadLiteral, e.what(), i); + } catch (...) 
{ + add_error(ParsingError::Type::BadLiteral, "unknown error", i); + } + return inspect_node(l, i); + } + + std::string_view IStreamQuadIterator::ImplXMLStateCollector::trim_left(std::string_view v) { + auto s = v.find_first_not_of(" \t\r\n"); + if (s == std::string_view::npos) { + return ""; + } + v.remove_prefix(s); + // ReSharper disable once CppDFALocalValueEscapesFunction + return v; + } + + bool IStreamQuadIterator::ImplXMLStateCollector::iri_equal_pieces(std::string_view const full_iri, std::string_view const uri, std::string_view const local_name) { + if (full_iri.size() != local_name.size() + uri.size()) { + return false; + } + return full_iri.starts_with(uri) && full_iri.ends_with(local_name); + } + +} // namespace rdf4cpp::parser \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLParserStateCollector.hpp b/private/rdf4cpp/parser/XMLParserStateCollector.hpp index 00dc3c95..12c0e957 100644 --- a/private/rdf4cpp/parser/XMLParserStateCollector.hpp +++ b/private/rdf4cpp/parser/XMLParserStateCollector.hpp @@ -2,8 +2,6 @@ #define RDF4CPP_XMLPARSERSTATECOLLECTOR_H #include -#include -#include #include @@ -18,8 +16,6 @@ #include -#include - namespace rdf4cpp::parser { struct IStreamQuadIterator::ImplXMLStateCollector : Impl { protected: @@ -84,6 +80,85 @@ namespace rdf4cpp::parser { struct CollectionState; struct EmptyElement; + + struct PopState {}; + struct NoStateChange {}; + + struct Info { + uint64_t line; + uint64_t column; + std::string_view base; + std::string_view lang_tag; + std::string_view source; + int source_offset; + }; + + struct XMLOutputQueue { + private: + std::deque result_queue_; + size_t next_bn_index_ = 0; + state_type *state_; + bool state_is_owned_ = false; + dice::sparse_map::sparse_set reserved_ids_; + + static constexpr std::string_view reify_subject = "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject"; + static constexpr std::string_view reify_predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate"; + 
static constexpr std::string_view reify_object = "http://www.w3.org/1999/02/22-rdf-syntax-ns#object"; + static constexpr std::string_view reify_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement"; + + template + [[nodiscard]] NT inspect_node(NT node, Info const &i); + + public: + explicit XMLOutputQueue(state_type *state); + ~XMLOutputQueue(); + + XMLOutputQueue(XMLOutputQueue const &) = delete; + XMLOutputQueue &operator=(XMLOutputQueue const &) = delete; + XMLOutputQueue(XMLOutputQueue &&) = delete; + XMLOutputQueue &operator=(XMLOutputQueue &&) = delete; + + [[nodiscard]] bool empty() const; + [[nodiscard]] std::optional next(); + [[nodiscard]] std::string_view current_base_iri() const; + + void add_error(ParsingError::Type ty, std::string msg, Info const &i); + /** + * add statement to the output list, if none of the components is null + * (null is used to track an already inserted parse error for that component) + */ + void add_statement(Node subject, IRI predicate, Node object, IRI reify); + /** + * create an IRI with no checks, intended for hardcoded IRIs like reify_subject + */ + [[nodiscard]] IRI make_hardcoded_iri(std::string_view iri) const; + [[nodiscard]] IRI make_type_iri() const; + [[nodiscard]] IRI make_iri(std::string_view iri, std::string_view base, Info const &i); + [[nodiscard]] IRI make_iri(std::string_view uri, std::string_view local_name, std::string_view base, Info const &i); + /** + * create the IRI for an id_attrib, including uniqueness check + */ + [[nodiscard]] IRI make_id(std::string_view local_name, std::string_view base, Info const &i); + [[nodiscard]] Node make_bn(std::optional name, Info const &i); + /** + * creates a literal + * @param value + * @param datatype + * @param lang_tag (ignored, if datatype is set) + * @param i + * @return + */ + [[nodiscard]] Literal make_literal(std::string_view value, std::optional datatype, std::optional lang_tag, Info const &i); + }; + + /** + * removes whitespace according to xml spec + 
*/ + [[nodiscard]] static std::string_view trim_left(std::string_view v); + [[nodiscard]] static bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); + [[nodiscard]] static bool iri_reserved(std::string_view uri, std::string_view local_name); + + struct StateTransition; }; } // namespace rdf4cpp::parser diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp index f9fda204..dbfb3e2c 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp @@ -1,18 +1,18 @@ #include namespace rdf4cpp::parser { - IStreamQuadIterator::ImplXML::BaseState::InheritedAttributeInfo IStreamQuadIterator::ImplXML::BaseState::get_inherited_attributes(ImplXML &impl, std::span const attributes) { + IStreamQuadIterator::ImplXML::BaseState::InheritedAttributeInfo IStreamQuadIterator::ImplXMLStateCollector::BaseState::get_inherited_attributes(XMLOutputQueue &out, std::span const attributes, Info const &info) { InheritedAttributeInfo r{}; for (auto const &a : attributes) { - if (ImplXML::iri_equal_pieces(base_attribute, a.uri(), a.local_name())) { + if (iri_equal_pieces(base_attribute, a.uri(), a.local_name())) { if (auto e = IRIView(a.value()).quick_validate(); e != IRIFactoryError::Ok) { - impl.add_error(ParsingError::Type::BadIri, std::format("invalid base IRI ({}): {}", e, a.value())); + out.add_error(ParsingError::Type::BadIri, std::format("invalid base IRI ({}): {}", e, a.value()), info); } r.base = a.value(); - } else if (ImplXML::iri_equal_pieces(lang_attribute, a.uri(), a.local_name())) { + } else if (iri_equal_pieces(lang_attribute, a.uri(), a.local_name())) { r.lang_tag = a.value(); } } return r; } -} \ No newline at end of file +} // namespace rdf4cpp::parser \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp 
b/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp index 84f9dfbf..01650b3d 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp @@ -11,9 +11,9 @@ namespace rdf4cpp::parser { */ struct IStreamQuadIterator::ImplXMLStateCollector::BaseState { // NOLINT(*-special-member-functions) virtual ~BaseState() = default; - virtual void on_characters(ImplXML &impl, std::string_view chars) = 0; - virtual void on_start_element(ImplXML &impl, std::string_view local_name, std::string_view uri, std::span attributes) = 0; - virtual void on_end_element(ImplXML &impl) = 0; + [[nodiscard]] virtual StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) = 0; + [[nodiscard]] virtual StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) = 0; + [[nodiscard]] virtual StateTransition on_end_element(XMLOutputQueue &out, Info const &info) = 0; virtual void move_to(BaseState *b) noexcept = 0; struct InheritedAttributeInfo { @@ -30,8 +30,8 @@ namespace rdf4cpp::parser { static constexpr std::string_view base_attribute = "http://www.w3.org/XML/1998/namespacebase"; static constexpr std::string_view lang_attribute = "http://www.w3.org/XML/1998/namespacelang"; - static InheritedAttributeInfo get_inherited_attributes(ImplXML &impl, std::span attributes); + static InheritedAttributeInfo get_inherited_attributes(XMLOutputQueue &out, std::span attributes, Info const &info); }; -} +} // namespace rdf4cpp::parser #endif //RDF4CPP_XMLPARSERBASESTATE_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp index 51d72a8e..0bb4c2e2 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp @@ -1,39 +1,38 @@ #include -#include - 
namespace rdf4cpp::parser { - void IStreamQuadIterator::ImplXML::CollectionState::on_characters(ImplXML &i, std::string_view const chars) { - if (!ImplXML::trim_left(chars).empty()) { - i.add_error(ParsingError::Type::BadSyntax, "expected element, found characters"); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::CollectionState::on_characters(XMLOutputQueue &out, std::string_view const chars, Info const &info) { + if (!trim_left(chars).empty()) { + out.add_error(ParsingError::Type::BadSyntax, "expected element, found characters", info); } + return {}; } - void IStreamQuadIterator::ImplXML::CollectionState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes) { - DescriptionState::enter(i, local_name, uri, attributes, [&](Node const obj) { - if (first) { - first = false; - last_bn = i.make_bn(std::nullopt); - i.add_statement(subject, predicate, last_bn, reify); - } else { - auto const bn = i.make_bn(std::nullopt); - i.add_statement(last_bn, i.make_hardcoded_iri(iri_rest), bn, IRI::make_null()); - last_bn = bn; - } - i.add_statement(last_bn, i.make_hardcoded_iri(iri_first), obj, IRI::make_null()); - }); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::CollectionState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span const attributes, Info const &info) { + auto [transition, obj] = DescriptionState::enter(out, local_name, uri, attributes, info); + if (first) { + first = false; + last_bn = out.make_bn(std::nullopt, info); + out.add_statement(subject, predicate, last_bn, reify); + } else { + auto const bn = out.make_bn(std::nullopt, info); + out.add_statement(last_bn, out.make_hardcoded_iri(iri_rest), bn, IRI::make_null()); + last_bn = bn; + } + out.add_statement(last_bn, out.make_hardcoded_iri(iri_first), obj, IRI::make_null()); 
+ return transition; } - void IStreamQuadIterator::ImplXML::CollectionState::on_end_element(ImplXML &i) { - auto const nil = i.make_hardcoded_iri(iri_nil); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::CollectionState::on_end_element(XMLOutputQueue &out, [[maybe_unused]] Info const &info) { + auto const nil = out.make_hardcoded_iri(iri_nil); if (first) { - i.add_statement(subject, predicate, nil, reify); + out.add_statement(subject, predicate, nil, reify); } else { - i.add_statement(last_bn, i.make_hardcoded_iri(iri_rest), nil, IRI::make_null()); + out.add_statement(last_bn, out.make_hardcoded_iri(iri_rest), nil, IRI::make_null()); } - i.pop_state(); + return StateTransition{std::in_place_type_t{}}; } - void IStreamQuadIterator::ImplXML::CollectionState::move_to(BaseState *b) noexcept { + void IStreamQuadIterator::ImplXMLStateCollector::CollectionState::move_to(BaseState *b) noexcept { new (b) CollectionState(std::move(*this)); } -} \ No newline at end of file +} // namespace rdf4cpp::parser \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp index 75d1510d..d23e7157 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp @@ -8,9 +8,9 @@ namespace rdf4cpp::parser { * state for https://www.w3.org/TR/rdf11-xml/#parseTypeCollectionPropertyElt */ struct IStreamQuadIterator::ImplXMLStateCollector::CollectionState final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; + StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) override; + StateTransition on_start_element(XMLOutputQueue 
&out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; void move_to(BaseState *b) noexcept override; Node subject; diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp index 9d64dffa..21203cc0 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp @@ -1,19 +1,20 @@ #include namespace rdf4cpp::parser { - void IStreamQuadIterator::ImplXML::DescriptionState::on_characters(ImplXML &i, std::string_view const chars) { - if (!ImplXML::trim_left(chars).empty()) { - i.add_error(ParsingError::Type::BadSyntax, "expected predicate, found characters"); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::DescriptionState::on_characters(XMLOutputQueue &out, std::string_view const chars, Info const &info) { + if (!trim_left(chars).empty()) { + out.add_error(ParsingError::Type::BadSyntax, "expected predicate, found characters", info); } + return {}; } - void IStreamQuadIterator::ImplXML::DescriptionState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span attributes) { - auto const inherited_attribute_info = get_inherited_attributes(i, attributes); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::DescriptionState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span attributes, Info const &info) { + auto const inherited_attribute_info = get_inherited_attributes(out, attributes, info); IRI predicate; - if (ImplXML::iri_equal_pieces(PredicateState::list_start_element, uri, local_name)) { - predicate = 
i.make_iri(std::format("http://www.w3.org/1999/02/22-rdf-syntax-ns#_{}", list_current++), inherited_attribute_info.base); + if (iri_equal_pieces(PredicateState::list_start_element, uri, local_name)) { + predicate = out.make_iri(std::format("http://www.w3.org/1999/02/22-rdf-syntax-ns#_{}", list_current++), inherited_attribute_info.base, info); } else { - predicate = i.make_iri(uri, local_name, inherited_attribute_info.base); + predicate = out.make_iri(uri, local_name, inherited_attribute_info.base, info); } std::optional datatype = std::nullopt; std::optional sub = std::nullopt; @@ -22,15 +23,15 @@ namespace rdf4cpp::parser { bool parse_literal = false; bool parse_collection = false; for (auto const &att : attributes) { - if (ImplXML::iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name())) { - datatype = i.make_iri(att.value(), inherited_attribute_info.base); - } else if (ImplXML::iri_equal_pieces(PredicateState::resource_attrib, att.uri(), att.local_name())) { - sub = i.make_iri(att.value(), inherited_attribute_info.base); - } else if (ImplXML::iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { - sub = i.make_bn(att.value()); - } else if (ImplXML::iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { - reify = i.make_id(att.value(), inherited_attribute_info.base); - } else if (ImplXML::iri_equal_pieces(PredicateState::parse_type_attrib, att.uri(), att.local_name())) { + if (iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name())) { + datatype = out.make_iri(att.value(), inherited_attribute_info.base, info); + } else if (iri_equal_pieces(PredicateState::resource_attrib, att.uri(), att.local_name())) { + sub = out.make_iri(att.value(), inherited_attribute_info.base, info); + } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { + sub = out.make_bn(att.value(), info); + } else if (iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { + reify = 
out.make_id(att.value(), inherited_attribute_info.base, info); + } else if (iri_equal_pieces(PredicateState::parse_type_attrib, att.uri(), att.local_name())) { if (att.value() == PredicateState::parse_type_resource) { parse_resource = true; } else if (att.value() == PredicateState::parse_type_collection) { @@ -45,44 +46,97 @@ namespace rdf4cpp::parser { continue; } if (!sub.has_value()) { - sub = i.make_bn(std::nullopt); + sub = out.make_bn(std::nullopt, info); } - if (ImplXML::iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { - IRI const obj = i.make_iri(att.value(), base); - i.add_statement(*sub, i.make_type_iri(), obj, IRI::make_null()); + if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { + IRI const obj = out.make_iri(att.value(), base, info); + out.add_statement(*sub, out.make_type_iri(), obj, IRI::make_null()); } else { - IRI const pred = i.make_iri(att.uri(), att.local_name(), base); - Literal const obj = i.make_literal(att.value(), std::nullopt, inherited_attribute_info.lang_tag); - i.add_statement(*sub, pred, obj, IRI::make_null()); + IRI const pred = out.make_iri(att.uri(), att.local_name(), base, info); + Literal const obj = out.make_literal(att.value(), std::nullopt, inherited_attribute_info.lang_tag, info); + out.add_statement(*sub, pred, obj, IRI::make_null()); } } if (sub.has_value() && (parse_collection || parse_literal || parse_resource)) { - i.add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:parseType, rdf:nodeID and rdf:resource"); + out.add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:parseType, rdf:nodeID and rdf:resource", info); } if (datatype.has_value()) { - i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify, *datatype); + return StateTransition(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify, *datatype); } else if (sub.has_value()) { - i.add_statement(subject, predicate, *sub, reify); - 
i.state_stack_.emplace_back(std::in_place_type_t{}); + out.add_statement(subject, predicate, *sub, reify); + return StateTransition(std::in_place_type_t{}); } else if (parse_resource) { - Node const obj = i.make_bn(std::nullopt); - i.add_statement(subject, predicate, obj, reify); - i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, obj); + Node const obj = out.make_bn(std::nullopt, info); + out.add_statement(subject, predicate, obj, reify); + return StateTransition(std::in_place_type_t{}, inherited_attribute_info, obj); } else if (parse_literal) { - auto &xml_state = i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); - static_cast(xml_state.get()).source_input(i); // NOLINT(*-pro-type-static-cast-downcast) + return StateTransition(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify, info); } else if (parse_collection) { - i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); + return StateTransition(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); } else { - i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); + return StateTransition(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); } - i.update_current_state(); } - void IStreamQuadIterator::ImplXML::DescriptionState::on_end_element(ImplXML &i) { - i.pop_state(); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::DescriptionState::on_end_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] Info const &info) { + return StateTransition{std::in_place_type_t{}}; } - void IStreamQuadIterator::ImplXML::DescriptionState::move_to(BaseState *b) noexcept { + void IStreamQuadIterator::ImplXMLStateCollector::DescriptionState::move_to(BaseState *b) noexcept { new (b) 
DescriptionState(std::move(*this)); } -} \ No newline at end of file + std::pair IStreamQuadIterator::ImplXMLStateCollector::DescriptionState::enter(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) { + auto const inherited_attribute_info = get_inherited_attributes(out, attributes, info); + Node sub = Node::make_null(); + auto check_only_one = [&sub, &out, &info]() { + if (!sub.null()) { + out.add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:ID, rdf:about, and rdf:nodeID", info); + return true; + } + return false; + }; + for (auto const &att : attributes) { + if (iri_equal_pieces(about_attrib, att.uri(), att.local_name())) { + if (check_only_one()) { + continue; + } + sub = out.make_iri(att.value(), inherited_attribute_info.base, info); + } else if (iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { + if (check_only_one()) { + continue; + } + sub = out.make_id(att.value(), inherited_attribute_info.base, info); + } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { + if (check_only_one()) { + continue; + } + sub = out.make_bn(att.value(), info); + } + } + if (sub.null()) { + sub = out.make_bn(std::nullopt, info); + } + if (!iri_equal_pieces(start_element, uri, local_name)) { + IRI const obj = out.make_iri(uri, local_name, inherited_attribute_info.base, info); + if (!obj.null()) { + out.add_statement(sub, out.make_type_iri(), obj, IRI::make_null()); + } + } + for (auto const &att : attributes) { + if (PredicateState::iri_reserved_predicate(att.uri(), att.local_name())) { + continue; + } + if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { + IRI const obj = out.make_iri(att.value(), inherited_attribute_info.base, info); + out.add_statement(sub, out.make_type_iri(), obj, IRI::make_null()); + } else { + IRI const pred = out.make_iri(att.uri(), att.local_name(), inherited_attribute_info.base, info); + Literal const obj = 
out.make_literal(att.value(), std::nullopt, inherited_attribute_info.lang_tag, info); + out.add_statement(sub, pred, obj, IRI::make_null()); + } + } + return { + StateTransition{std::in_place_type_t{}, inherited_attribute_info, sub}, + sub, + }; + } +} // namespace rdf4cpp::parser \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp index 04d8e6eb..5bddb1c0 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp @@ -11,9 +11,9 @@ namespace rdf4cpp::parser { * (https://www.w3.org/TR/rdf11-xml/#parseTypeResourcePropertyElt has no own state, instead gets handled directly by on_start_element) */ struct IStreamQuadIterator::ImplXMLStateCollector::DescriptionState final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; + StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) override; + StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; void move_to(BaseState *b) noexcept override; Node subject; @@ -23,11 +23,7 @@ namespace rdf4cpp::parser { : BaseState(i), subject(sub) { } - /** - * include XMLParserDescriptionStateEnter.hpp to use it - */ - template - static void enter(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes, F f); + static std::pair enter(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info); static constexpr std::string_view start_element = 
"http://www.w3.org/1999/02/22-rdf-syntax-ns#Description"; static constexpr std::string_view about_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about"; diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionStateEnter.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionStateEnter.hpp deleted file mode 100644 index 0636ea26..00000000 --- a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionStateEnter.hpp +++ /dev/null @@ -1,59 +0,0 @@ -#include - -namespace rdf4cpp::parser { - template - void IStreamQuadIterator::ImplXML::DescriptionState::enter(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes, F f) { - auto const inherited_attribute_info = get_inherited_attributes(i, attributes); - Node sub = Node::make_null(); - auto check_only_one = [&sub, &i]() { - if (!sub.null()) { - i.add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:ID, rdf:about, and rdf:nodeID"); - return true; - } - return false; - }; - for (auto const &att : attributes) { - if (ImplXML::iri_equal_pieces(about_attrib, att.uri(), att.local_name())) { - if (check_only_one()) { - continue; - } - sub = i.make_iri(att.value(), inherited_attribute_info.base); - } else if (ImplXML::iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { - if (check_only_one()) { - continue; - } - sub = i.make_id(att.value(), inherited_attribute_info.base); - } else if (ImplXML::iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { - if (check_only_one()) { - continue; - } - sub = i.make_bn(att.value()); - } - } - if (sub.null()) { - sub = i.make_bn(std::nullopt); - } - if (!ImplXML::iri_equal_pieces(start_element, uri, local_name)) { - IRI const obj = i.make_iri(uri, local_name, inherited_attribute_info.base); - if (!obj.null()) { - i.add_statement(sub, i.make_type_iri(), obj, IRI::make_null()); - } - } - for (auto const &att : attributes) { - if (PredicateState::iri_reserved_predicate(att.uri(), att.local_name())) { - 
continue; - } - if (ImplXML::iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { - IRI const obj = i.make_iri(att.value(), inherited_attribute_info.base); - i.add_statement(sub, i.make_type_iri(), obj, IRI::make_null()); - } else { - IRI const pred = i.make_iri(att.uri(), att.local_name(), inherited_attribute_info.base); - Literal const obj = i.make_literal(att.value(), std::nullopt, inherited_attribute_info.lang_tag); - i.add_statement(sub, pred, obj, IRI::make_null()); - } - } - f(sub); - i.state_stack_.emplace_back(std::in_place_type_t{}, inherited_attribute_info, sub); - i.update_current_state(); - } -} \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp index 7b488fcd..482d6d01 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp @@ -1,20 +1,22 @@ #include namespace rdf4cpp::parser { - void IStreamQuadIterator::ImplXML::EmptyElement::on_characters(ImplXML &i, std::string_view const chars) { - if (!ImplXML::trim_left(chars).empty()) { - i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found characters"); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::EmptyElement::on_characters(XMLOutputQueue &out, std::string_view const chars, Info const &info) { + if (!trim_left(chars).empty()) { + out.add_error(ParsingError::Type::BadSyntax, "expected end of element, found characters", info); } + return {}; } - void IStreamQuadIterator::ImplXML::EmptyElement::on_start_element(ImplXML &i, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { - i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found ???"); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition 
IStreamQuadIterator::ImplXMLStateCollector::EmptyElement::on_start_element(XMLOutputQueue &out, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes, Info const &info) { + out.add_error(ParsingError::Type::BadSyntax, "expected end of element, found ???", info); + return {}; } - void IStreamQuadIterator::ImplXML::EmptyElement::on_end_element(ImplXML &i) { - i.pop_state(); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::EmptyElement::on_end_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] Info const &info) { + return StateTransition{std::in_place_type_t{}}; } - void IStreamQuadIterator::ImplXML::EmptyElement::move_to(BaseState *b) noexcept { + void IStreamQuadIterator::ImplXMLStateCollector::EmptyElement::move_to(BaseState *b) noexcept { new (b) EmptyElement(std::move(*this)); } -} \ No newline at end of file +} // namespace rdf4cpp::parser \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp index cc47038f..77135624 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp @@ -8,9 +8,9 @@ namespace rdf4cpp::parser { * state for https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (if attributes are present) */ struct IStreamQuadIterator::ImplXMLStateCollector::EmptyElement final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; + StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) override; + StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span 
attributes, Info const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; void move_to(BaseState *b) noexcept override; EmptyElement() diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp index 29b35271..20e37bc6 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp @@ -1,25 +1,29 @@ #include namespace rdf4cpp::parser { - void IStreamQuadIterator::ImplXML::InitialState::on_characters(ImplXML &i, std::string_view const chars) { - if (!ImplXML::trim_left(chars).empty()) { - i.add_error(ParsingError::Type::BadSyntax, "expected RDF, found characters"); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::InitialState::on_characters(XMLOutputQueue &out, std::string_view const chars, Info const &info) { + if (!trim_left(chars).empty()) { + out.add_error(ParsingError::Type::BadSyntax, "expected RDF, found characters", info); } + return {}; } - void IStreamQuadIterator::ImplXML::InitialState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes) { - if (ImplXML::iri_equal_pieces(RDFState::start_element, uri, local_name)) { - i.state_stack_.emplace_back(std::in_place_type_t{}, get_inherited_attributes(i, attributes)); - i.update_current_state(); - return; + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::InitialState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes, Info const &info) { + if (iri_equal_pieces(RDFState::start_element, uri, local_name)) { + return StateTransition{ + std::in_place_type_t{}, + get_inherited_attributes(out, attributes, info), + }; } - 
i.add_error(ParsingError::Type::BadSyntax, "expected RDF, found ???"); + out.add_error(ParsingError::Type::BadSyntax, "expected RDF, found ???", info); + return {}; } - void IStreamQuadIterator::ImplXML::InitialState::on_end_element(ImplXML &i) { - i.add_error(ParsingError::Type::BadSyntax, "expected RDF, found end of ???"); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::InitialState::on_end_element(XMLOutputQueue &out, Info const &info) { + out.add_error(ParsingError::Type::BadSyntax, "expected RDF, found end of ???", info); + return {}; } - void IStreamQuadIterator::ImplXML::InitialState::move_to(BaseState *b) noexcept { + void IStreamQuadIterator::ImplXMLStateCollector::InitialState::move_to(BaseState *b) noexcept { new (b) InitialState(std::move(*this)); } -} \ No newline at end of file +} // namespace rdf4cpp::parser \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp index ab341f60..caf0d42c 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp @@ -9,9 +9,9 @@ namespace rdf4cpp::parser { * initial state, checks for start of https://www.w3.org/TR/rdf11-xml/#RDF */ struct IStreamQuadIterator::ImplXMLStateCollector::InitialState final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; + StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) override; + StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; void 
move_to(BaseState *b) noexcept override; InitialState() diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp index 5d656bca..0df77bad 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp @@ -1,47 +1,46 @@ #include -#include - namespace rdf4cpp::parser { - void IStreamQuadIterator::ImplXML::PredicateState::on_characters([[maybe_unused]] ImplXML &i, std::string_view const chars) { + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::PredicateState::on_characters([[maybe_unused]] XMLOutputQueue &out, std::string_view const chars, Info const &info) { if (done) { - if (!ImplXML::trim_left(chars).empty()) { - i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found literal"); + if (!trim_left(chars).empty()) { + out.add_error(ParsingError::Type::BadSyntax, "expected end of element, found literal", info); } - return; + return {}; } literal.append(chars); + return {}; } - void IStreamQuadIterator::ImplXML::PredicateState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes) { - if (!ImplXML::trim_left(literal).empty()) { - i.add_error(ParsingError::Type::BadSyntax, "expected end of element or literal, found element"); - return; + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::PredicateState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span const attributes, Info const &info) { + if (!trim_left(literal).empty()) { + out.add_error(ParsingError::Type::BadSyntax, "expected end of element or literal, found element", info); + return {}; } if (done) { - i.add_error(ParsingError::Type::BadSyntax, "expected end of element, found element"); - return; + 
out.add_error(ParsingError::Type::BadSyntax, "expected end of element, found element", info); + return {}; } - DescriptionState::enter(i, local_name, uri, attributes, [&](Node obj) { - done = true; - i.add_statement(subject, predicate, obj, reify); - }); + auto [transition, obj] = DescriptionState::enter(out, local_name, uri, attributes, info); + done = true; + out.add_statement(subject, predicate, obj, reify); + return transition; } - void IStreamQuadIterator::ImplXML::PredicateState::on_end_element(ImplXML &i) { + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::PredicateState::on_end_element(XMLOutputQueue &out, Info const &info) { if (!done) { - Literal const lit = i.make_literal(literal, std::nullopt, std::nullopt); - i.add_statement(subject, predicate, lit, reify); + Literal const lit = out.make_literal(literal, std::nullopt, std::nullopt, info); + out.add_statement(subject, predicate, lit, reify); } - i.pop_state(); + return StateTransition{std::in_place_type_t{}}; } - void IStreamQuadIterator::ImplXML::PredicateState::move_to(BaseState *b) noexcept { + void IStreamQuadIterator::ImplXMLStateCollector::PredicateState::move_to(BaseState *b) noexcept { new (b) PredicateState(std::move(*this)); } - bool IStreamQuadIterator::ImplXML::PredicateState::iri_reserved_predicate(std::string_view const uri, std::string_view const local_name) { - return ImplXML::iri_reserved(uri, local_name) || ImplXML::iri_equal_pieces(DescriptionState::start_element, uri, local_name) || ImplXML::iri_equal_pieces(list_start_element, uri, local_name); + bool IStreamQuadIterator::ImplXMLStateCollector::PredicateState::iri_reserved_predicate(std::string_view const uri, std::string_view const local_name) { + return iri_reserved(uri, local_name) || iri_equal_pieces(DescriptionState::start_element, uri, local_name) || iri_equal_pieces(list_start_element, uri, local_name); } -} \ No newline at end of file +} // namespace rdf4cpp::parser \ 
No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp index 20afa6e5..c9e8d8e1 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp @@ -11,9 +11,9 @@ namespace rdf4cpp::parser { * and https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (with no attributes (empty literal)) */ struct IStreamQuadIterator::ImplXMLStateCollector::PredicateState : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; + StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) override; + StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; void move_to(BaseState *b) noexcept override; Node subject; diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp index 03ef47df..f745a2e9 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp @@ -1,23 +1,22 @@ #include -#include - namespace rdf4cpp::parser { - void IStreamQuadIterator::ImplXML::RDFState::on_characters(ImplXML &i, std::string_view const chars) { - if (!ImplXML::trim_left(chars).empty()) { - i.add_error(ParsingError::Type::BadSyntax, "expected Description, found characters"); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::RDFState::on_characters(XMLOutputQueue &out, std::string_view const chars, Info const &info) { + if 
(!trim_left(chars).empty()) { + out.add_error(ParsingError::Type::BadSyntax, "expected Description, found characters", info); } + return {}; } - void IStreamQuadIterator::ImplXML::RDFState::on_start_element(ImplXML &i, std::string_view const local_name, std::string_view const uri, std::span const attributes) { - DescriptionState::enter(i, local_name, uri, attributes, [](auto) { - }); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::RDFState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span const attributes, Info const &info) { + auto [trans, _] = DescriptionState::enter(out, local_name, uri, attributes, info); + return trans; } - void IStreamQuadIterator::ImplXML::RDFState::on_end_element(ImplXML &i) { - i.pop_state(); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::RDFState::on_end_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] Info const &info) { + return StateTransition{std::in_place_type_t{}}; } - void IStreamQuadIterator::ImplXML::RDFState::move_to(BaseState *b) noexcept { + void IStreamQuadIterator::ImplXMLStateCollector::RDFState::move_to(BaseState *b) noexcept { new (b) RDFState(std::move(*this)); } } // namespace rdf4cpp::parser \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp index 34d68fa7..b05518e1 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp @@ -9,9 +9,9 @@ namespace rdf4cpp::parser { * state for https://www.w3.org/TR/rdf11-xml/#RDF */ struct IStreamQuadIterator::ImplXMLStateCollector::RDFState final : BaseState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span 
attributes) override; - void on_end_element(ImplXML &i) override; + StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) override; + StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; void move_to(BaseState *b) noexcept override; static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF"; diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp index c0380903..0cc97682 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp @@ -1,18 +1,19 @@ #include namespace rdf4cpp::parser { - void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::on_start_element(ImplXML &i, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { - i.add_error(ParsingError::Type::BadSyntax, "expected literal, found element"); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::TypedLiteralPredicateState::on_start_element(XMLOutputQueue &out, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes, Info const &info) { + out.add_error(ParsingError::Type::BadSyntax, "expected literal, found element", info); + return {}; } - void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::on_end_element(ImplXML &i) { + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::TypedLiteralPredicateState::on_end_element(XMLOutputQueue &out, Info const &info) { if (!datatype.null()) { - 
Literal const lit = i.make_literal(literal, datatype, std::nullopt); - i.add_statement(subject, predicate, lit, reify); + Literal const lit = out.make_literal(literal, datatype, std::nullopt, info); + out.add_statement(subject, predicate, lit, reify); } - i.pop_state(); + return StateTransition{std::in_place_type_t{}}; } - void IStreamQuadIterator::ImplXML::TypedLiteralPredicateState::move_to(BaseState *b) noexcept { + void IStreamQuadIterator::ImplXMLStateCollector::TypedLiteralPredicateState::move_to(BaseState *b) noexcept { new (b) TypedLiteralPredicateState(std::move(*this)); } -} \ No newline at end of file +} // namespace rdf4cpp::parser \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp index e002dcc8..66de9a2a 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp @@ -9,8 +9,8 @@ namespace rdf4cpp::parser { * state for https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (with datatype attribute) */ struct IStreamQuadIterator::ImplXMLStateCollector::TypedLiteralPredicateState final : PredicateState { - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; + StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; void move_to(BaseState *b) noexcept override; IRI datatype; diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp index 6715a39a..7cb4df12 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp +++ 
b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp @@ -1,22 +1,24 @@ #include namespace rdf4cpp::parser { - void IStreamQuadIterator::ImplXML::XMLLiteralState::on_characters(ImplXML &i, [[maybe_unused]] std::string_view chars) { - source_input(i); + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::XMLLiteralState::on_characters([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] std::string_view chars, Info const &info) { + source_input(info); + return {}; } - void IStreamQuadIterator::ImplXML::XMLLiteralState::on_start_element(ImplXML &i, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes) { + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::XMLLiteralState::on_start_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes, Info const &info) { ++depth; - source_input(i); + source_input(info); + return {}; } - void IStreamQuadIterator::ImplXML::XMLLiteralState::on_end_element(ImplXML &i) { + IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::XMLLiteralState::on_end_element(XMLOutputQueue &out, Info const &info) { if (depth > 0) { --depth; - source_input(i); - return; + source_input(info); + return {}; } - IRI datatype = i.make_hardcoded_iri("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"); + IRI datatype = out.make_hardcoded_iri("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"); std::string_view l = literal; l = l.substr(0, last_offset); l.remove_prefix(data_start); @@ -26,20 +28,17 @@ namespace rdf4cpp::parser { if (!l.empty() && l[0] == '>') { l.remove_prefix(1); } - Literal const lit = i.make_literal(l, datatype, std::nullopt); - i.add_statement(subject, predicate, lit, reify); - 
i.pop_state(); + Literal const lit = out.make_literal(l, datatype, std::nullopt, info); + out.add_statement(subject, predicate, lit, reify); + return StateTransition{std::in_place_type_t{}}; } - void IStreamQuadIterator::ImplXML::XMLLiteralState::move_to(BaseState *b) noexcept { + void IStreamQuadIterator::ImplXMLStateCollector::XMLLiteralState::move_to(BaseState *b) noexcept { new (b) XMLLiteralState(std::move(*this)); } - void IStreamQuadIterator::ImplXML::XMLLiteralState::source_input(ImplXML &i) { - xmlChar const *data; - int size = 1024; - int off = 0; - xmlCtxtGetInputWindow(i.context_.get(), 0, &data, &size, &off); - std::string_view const sv{reinterpret_cast(data), static_cast(size)}; + void IStreamQuadIterator::ImplXMLStateCollector::XMLLiteralState::source_input(Info const &info) { + int const off = info.source_offset; + std::string_view const sv = info.source; if (literal.empty()) { data_start = off; } @@ -49,4 +48,4 @@ namespace rdf4cpp::parser { } last_offset = static_cast(off) + last_size; } -} \ No newline at end of file +} // namespace rdf4cpp::parser \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp index 8efd6e19..0bdeb5fe 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp @@ -10,9 +10,9 @@ namespace rdf4cpp::parser { * (and https://www.w3.org/TR/rdf11-xml/#parseTypeOtherPropertyElt) */ struct IStreamQuadIterator::ImplXMLStateCollector::XMLLiteralState final : PredicateState { - void on_characters(ImplXML &i, std::string_view chars) override; - void on_start_element(ImplXML &i, std::string_view local_name, std::string_view uri, std::span attributes) override; - void on_end_element(ImplXML &i) override; + StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) override; + StateTransition 
on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; void move_to(BaseState *b) noexcept override; size_t depth = 0; @@ -20,9 +20,11 @@ namespace rdf4cpp::parser { size_t last_offset = 0; size_t last_size = 0; - using PredicateState::PredicateState; + void source_input(Info const &info); - void source_input(ImplXML &i); + XMLLiteralState(InheritedAttributeInfo const &i, Node sub, IRI predicate, IRI reify, Info const &info) : PredicateState(i, sub, predicate, reify) { + source_input(info); + } }; } From 7417925d5daf77c06823776b0ca4a9959e2f1918 Mon Sep 17 00:00:00 2001 From: mcb Date: Thu, 11 Dec 2025 13:29:13 +0100 Subject: [PATCH 34/42] reorganize classes --- CMakeLists.txt | 2 +- private/rdf4cpp/parser/XMLParser.cpp | 28 +-- private/rdf4cpp/parser/XMLParser.hpp | 20 ++- .../parser/XMLParserStateCollector.hpp | 165 ------------------ ...tateCollector.cpp => XMLParserUtility.cpp} | 36 ++-- private/rdf4cpp/parser/XMLParserUtility.hpp | 147 ++++++++++++++++ .../parser/XMLStates/XMLParserBaseState.cpp | 6 +- .../parser/XMLStates/XMLParserBaseState.hpp | 16 +- .../XMLStates/XMLParserCollectionState.cpp | 12 +- .../XMLStates/XMLParserCollectionState.hpp | 14 +- .../XMLStates/XMLParserDescriptionState.cpp | 14 +- .../XMLStates/XMLParserDescriptionState.hpp | 16 +- .../XMLStates/XMLParserEmptyElement.cpp | 12 +- .../XMLStates/XMLParserEmptyElement.hpp | 14 +- .../XMLStates/XMLParserInitialState.cpp | 12 +- .../XMLStates/XMLParserInitialState.hpp | 14 +- .../XMLStates/XMLParserPredicateState.cpp | 14 +- .../XMLStates/XMLParserPredicateState.hpp | 14 +- .../parser/XMLStates/XMLParserRDFState.cpp | 12 +- .../parser/XMLStates/XMLParserRDFState.hpp | 14 +- .../XMLParserTypedLiteralPredicateState.cpp | 10 +- .../XMLParserTypedLiteralPredicateState.hpp | 12 +- .../XMLStates/XMLParserXMLLiteralState.cpp | 14 +- 
.../XMLStates/XMLParserXMLLiteralState.hpp | 18 +- 24 files changed, 312 insertions(+), 324 deletions(-) delete mode 100644 private/rdf4cpp/parser/XMLParserStateCollector.hpp rename private/rdf4cpp/parser/{XMLParserStateCollector.cpp => XMLParserUtility.cpp} (68%) create mode 100644 private/rdf4cpp/parser/XMLParserUtility.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 851d4e78..b3db51cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,7 +151,7 @@ add_library(rdf4cpp src/rdf4cpp/util/Anonymizer.cpp private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp private/rdf4cpp/parser/XMLParser.cpp - private/rdf4cpp/parser/XMLParserStateCollector.cpp + private/rdf4cpp/parser/XMLParserUtility.cpp private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp diff --git a/private/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp index 1f02b9ec..3cb9d87b 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -22,7 +22,7 @@ namespace rdf4cpp::parser { int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) { auto *t = static_cast(th); t->handle_state_transition(t->current_state_->on_start_element(t->output_, from_xml_char(local_name), from_xml_char(uri), - std::span{reinterpret_cast(attributes), static_cast(n_attributes)}, t->make_info())); + std::span{reinterpret_cast(attributes), static_cast(n_attributes)}, t->make_info())); }; r.endElementNs = [](void *th, [[maybe_unused]] xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, [[maybe_unused]] xmlChar const *uri) { auto *t = static_cast(th); @@ -61,17 +61,17 @@ namespace rdf4cpp::parser { } // implemented here, to have access to states - bool IStreamQuadIterator::ImplXMLStateCollector::iri_reserved(std::string_view const uri, std::string_view const local_name) { + bool 
iri_reserved(std::string_view const uri, std::string_view const local_name) { static constexpr std::array reserved = { - RDFState::start_element, - DescriptionState::id_attrib, - DescriptionState::about_attrib, - PredicateState::parse_type_attrib, - PredicateState::resource_attrib, - DescriptionState::node_id_attrib, - TypedLiteralPredicateState::datatype_attrib, - BaseState::base_attribute, - BaseState::lang_attribute, + xml_states::RDFState::start_element, + xml_states::DescriptionState::id_attrib, + xml_states::DescriptionState::about_attrib, + xml_states::PredicateState::parse_type_attrib, + xml_states::PredicateState::resource_attrib, + xml_states::DescriptionState::node_id_attrib, + xml_states::TypedLiteralPredicateState::datatype_attrib, + xml_states::BaseState::base_attribute, + xml_states::BaseState::lang_attribute, std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"), std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"), std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"), @@ -97,7 +97,7 @@ namespace rdf4cpp::parser { va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay) } - IStreamQuadIterator::ImplXMLStateCollector::Info IStreamQuadIterator::ImplXML::make_info() const { + XMLStateInfo IStreamQuadIterator::ImplXML::make_info() const { std::string_view base = ""; for (auto const &s : state_stack_ | std::ranges::views::reverse) { std::string_view const v = s.get().base; @@ -122,7 +122,7 @@ namespace rdf4cpp::parser { xmlCtxtGetInputWindow(context_.get(), 0, &data, &size, &off); std::string_view const source{reinterpret_cast(data), static_cast(size)}; - return Info{ + return XMLStateInfo{ static_cast(xmlSAX2GetLineNumber(context_.get())), static_cast(xmlSAX2GetColumnNumber(context_.get())), base, @@ -139,7 +139,7 @@ namespace rdf4cpp::parser { output_(state) { xmlCtxtSetOptions(context_.get(), XML_PARSE_NOENT | XML_PARSE_PEDANTIC | XML_PARSE_NOCDATA | XML_PARSE_NO_XXE | 
XML_PARSE_BIG_LINES); state_stack_.reserve(10); - state_stack_.emplace_back(std::in_place_type_t{}); + state_stack_.emplace_back(std::in_place_type_t{}); update_current_state(); current_state_->base = output_.current_base_iri(); diff --git a/private/rdf4cpp/parser/XMLParser.hpp b/private/rdf4cpp/parser/XMLParser.hpp index 9eb7e476..2cd058d9 100644 --- a/private/rdf4cpp/parser/XMLParser.hpp +++ b/private/rdf4cpp/parser/XMLParser.hpp @@ -16,7 +16,7 @@ #include -#include +#include #include @@ -37,7 +37,7 @@ #include namespace rdf4cpp::parser { - struct IStreamQuadIterator::ImplXML final : ImplXMLStateCollector { + struct IStreamQuadIterator::ImplXML final : Impl { private: xmlSAXHandler handler_; // workaround for gcc-14 bug, erroneously warns on unsing a lambda here @@ -54,8 +54,12 @@ namespace rdf4cpp::parser { EOFFunc eof_func_; XMLOutputQueue output_; - BaseState *current_state_ = nullptr; - std::vector> state_stack_; + xml_states::BaseState *current_state_ = nullptr; + std::vector> + state_stack_; static xmlSAXHandler make_sax_handler(); @@ -65,7 +69,7 @@ namespace rdf4cpp::parser { static void on_error(void *th, char const *msg, ...); - [[nodiscard]] Info make_info() const; + [[nodiscard]] XMLStateInfo make_info() const; public: ImplXML(void *obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type *state); @@ -82,8 +86,10 @@ namespace rdf4cpp::parser { [[nodiscard]] uint64_t current_column() const noexcept override; }; - struct IStreamQuadIterator::ImplXMLStateCollector::StateTransition { - using ModifyStateStack = std::variant; + struct StateTransition { + using ModifyStateStack = std::variant; ModifyStateStack modify_state; diff --git a/private/rdf4cpp/parser/XMLParserStateCollector.hpp b/private/rdf4cpp/parser/XMLParserStateCollector.hpp deleted file mode 100644 index 12c0e957..00000000 --- a/private/rdf4cpp/parser/XMLParserStateCollector.hpp +++ /dev/null @@ -1,165 +0,0 @@ -#ifndef RDF4CPP_XMLPARSERSTATECOLLECTOR_H -#define 
RDF4CPP_XMLPARSERSTATECOLLECTOR_H - -#include - -#include - -#include - -#include -#include -#include -#include - -#include - -#include - -namespace rdf4cpp::parser { - struct IStreamQuadIterator::ImplXMLStateCollector : Impl { - protected: - static std::string_view from_xml_char(xmlChar const *s) { - if (s == nullptr) { - return ""; - } - // ReSharper disable once CppDFALocalValueEscapesFunction - return {reinterpret_cast(s)}; - } - - static std::string_view from_xml_char(xmlChar const *s, xmlChar const *e) { - if (s == nullptr) { - return ""; - } - // ReSharper disable once CppDFALocalValueEscapesFunction - return {reinterpret_cast(s), reinterpret_cast(e)}; - } - - static std::string_view from_xml_char(xmlChar const *s, int const n) { - if (s == nullptr) { - return ""; - } - // ReSharper disable once CppDFALocalValueEscapesFunction - return {reinterpret_cast(s), static_cast(n)}; - } - - struct Attribute { - xmlChar const *local_name_raw; - xmlChar const *prefix_raw; - xmlChar const *uri_raw; - xmlChar const *value_start_raw; - xmlChar const *value_end_raw; - - [[nodiscard]] std::string_view value() const { - return from_xml_char(value_start_raw, value_end_raw); - } - - [[nodiscard]] std::string_view local_name() const { - return from_xml_char(local_name_raw); - } - - [[nodiscard]] std::string_view uri() const { - return from_xml_char(uri_raw); - } - }; - - struct BaseState; - - struct InitialState; - - struct RDFState; - - struct DescriptionState; - - struct PredicateState; - - struct TypedLiteralPredicateState; - - struct XMLLiteralState; - - struct CollectionState; - - struct EmptyElement; - - struct PopState {}; - struct NoStateChange {}; - - struct Info { - uint64_t line; - uint64_t column; - std::string_view base; - std::string_view lang_tag; - std::string_view source; - int source_offset; - }; - - struct XMLOutputQueue { - private: - std::deque result_queue_; - size_t next_bn_index_ = 0; - state_type *state_; - bool state_is_owned_ = false; - 
dice::sparse_map::sparse_set reserved_ids_; - - static constexpr std::string_view reify_subject = "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject"; - static constexpr std::string_view reify_predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate"; - static constexpr std::string_view reify_object = "http://www.w3.org/1999/02/22-rdf-syntax-ns#object"; - static constexpr std::string_view reify_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement"; - - template - [[nodiscard]] NT inspect_node(NT node, Info const &i); - - public: - explicit XMLOutputQueue(state_type *state); - ~XMLOutputQueue(); - - XMLOutputQueue(XMLOutputQueue const &) = delete; - XMLOutputQueue &operator=(XMLOutputQueue const &) = delete; - XMLOutputQueue(XMLOutputQueue &&) = delete; - XMLOutputQueue &operator=(XMLOutputQueue &&) = delete; - - [[nodiscard]] bool empty() const; - [[nodiscard]] std::optional next(); - [[nodiscard]] std::string_view current_base_iri() const; - - void add_error(ParsingError::Type ty, std::string msg, Info const &i); - /** - * add statement to the output list, if none of the components is null - * (null is used to track an already inserted parse error for that component) - */ - void add_statement(Node subject, IRI predicate, Node object, IRI reify); - /** - * create an IRI with no checks, intended for hardcoded IRIs like reify_subject - */ - [[nodiscard]] IRI make_hardcoded_iri(std::string_view iri) const; - [[nodiscard]] IRI make_type_iri() const; - [[nodiscard]] IRI make_iri(std::string_view iri, std::string_view base, Info const &i); - [[nodiscard]] IRI make_iri(std::string_view uri, std::string_view local_name, std::string_view base, Info const &i); - /** - * create the IRI for an id_attrib, including uniqueness check - */ - [[nodiscard]] IRI make_id(std::string_view local_name, std::string_view base, Info const &i); - [[nodiscard]] Node make_bn(std::optional name, Info const &i); - /** - * creates a literal - * @param value - * @param datatype - 
* @param lang_tag (ignored, if datatype is set) - * @param i - * @return - */ - [[nodiscard]] Literal make_literal(std::string_view value, std::optional datatype, std::optional lang_tag, Info const &i); - }; - - /** - * removes whitespace according to xml spec - */ - [[nodiscard]] static std::string_view trim_left(std::string_view v); - [[nodiscard]] static bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); - [[nodiscard]] static bool iri_reserved(std::string_view uri, std::string_view local_name); - - struct StateTransition; - }; -} // namespace rdf4cpp::parser - -#endif //RDF4CPP_XMLPARSERSTATECOLLECTOR_H diff --git a/private/rdf4cpp/parser/XMLParserStateCollector.cpp b/private/rdf4cpp/parser/XMLParserUtility.cpp similarity index 68% rename from private/rdf4cpp/parser/XMLParserStateCollector.cpp rename to private/rdf4cpp/parser/XMLParserUtility.cpp index 693cec2b..ddda1683 100644 --- a/private/rdf4cpp/parser/XMLParserStateCollector.cpp +++ b/private/rdf4cpp/parser/XMLParserUtility.cpp @@ -1,25 +1,25 @@ -#include +#include namespace rdf4cpp::parser { - IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::XMLOutputQueue(state_type *state) : state_(state) { + XMLOutputQueue::XMLOutputQueue(state_type *state) : state_(state) { if (state_ == nullptr) { state_ = new state_type(); state_is_owned_ = true; } } - IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::~XMLOutputQueue() { + XMLOutputQueue::~XMLOutputQueue() { if (state_is_owned_) { delete state_; } } - bool IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::empty() const { + bool XMLOutputQueue::empty() const { return result_queue_.empty(); } - std::optional IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::next() { + std::optional XMLOutputQueue::next() { if (result_queue_.empty()) { return std::nullopt; } @@ -28,15 +28,15 @@ namespace rdf4cpp::parser { return r; } - std::string_view 
IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::current_base_iri() const { + std::string_view XMLOutputQueue::current_base_iri() const { return state_->iri_factory.get_base(); } - void IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::add_error(ParsingError::Type ty, std::string msg, Info const &i) { + void XMLOutputQueue::add_error(ParsingError::Type ty, std::string msg, XMLStateInfo const &i) { result_queue_.emplace_back(nonstd::unexpect, ty, i.line, i.column, std::move(msg)); } - void IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::add_statement(Node subject, IRI predicate, Node object, IRI reify) { + void XMLOutputQueue::add_statement(Node subject, IRI predicate, Node object, IRI reify) { if (subject.null() || predicate.null() || object.null()) { return; } @@ -49,16 +49,16 @@ namespace rdf4cpp::parser { } } - IRI IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::make_hardcoded_iri(std::string_view const iri) const { + IRI XMLOutputQueue::make_hardcoded_iri(std::string_view const iri) const { return IRI::make_unchecked(iri, state_->node_storage); } - IRI IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::make_type_iri() const { + IRI XMLOutputQueue::make_type_iri() const { return IRI::rdf_type(state_->node_storage); } template - NT IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::inspect_node(NT node, Info const &i) { + NT XMLOutputQueue::inspect_node(NT node, XMLStateInfo const &i) { try { state_->inspect_node_func(node); return node; @@ -70,7 +70,7 @@ namespace rdf4cpp::parser { return NT::make_null(); } - IRI IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::make_iri(std::string_view const iri, std::string_view const base, Info const &i) { + IRI XMLOutputQueue::make_iri(std::string_view const iri, std::string_view const base, XMLStateInfo const &i) { if (base.empty()) { state_->iri_factory.set_base_unchecked(i.base); } else { @@ -85,13 +85,13 @@ namespace rdf4cpp::parser { } } - IRI 
IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::make_iri(std::string_view const uri, std::string_view const local_name, std::string_view const base, Info const &i) { + IRI XMLOutputQueue::make_iri(std::string_view const uri, std::string_view const local_name, std::string_view const base, XMLStateInfo const &i) { std::string iri{uri}; iri.append(local_name); return make_iri(iri, base, i); } - IRI IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::make_id(std::string_view const local_name, std::string_view const base, Info const &i) { + IRI XMLOutputQueue::make_id(std::string_view const local_name, std::string_view const base, XMLStateInfo const &i) { std::string local = "#"; local.append(local_name); auto iri = make_iri(local, base, i); @@ -103,7 +103,7 @@ namespace rdf4cpp::parser { return iri; } - Node IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::make_bn(std::optional name, Info const &i) { + Node XMLOutputQueue::make_bn(std::optional name, XMLStateInfo const &i) { std::string n = ""; if (!name.has_value()) { n = std::format("bn_{}", next_bn_index_++); @@ -124,7 +124,7 @@ namespace rdf4cpp::parser { } } - Literal IStreamQuadIterator::ImplXMLStateCollector::XMLOutputQueue::make_literal(std::string_view value, std::optional datatype, std::optional lang_tag, Info const &i) { + Literal XMLOutputQueue::make_literal(std::string_view value, std::optional datatype, std::optional lang_tag, XMLStateInfo const &i) { Literal l = Literal::make_null(); try { if (datatype.has_value()) { @@ -147,7 +147,7 @@ namespace rdf4cpp::parser { return inspect_node(l, i); } - std::string_view IStreamQuadIterator::ImplXMLStateCollector::trim_left(std::string_view v) { + std::string_view trim_left(std::string_view v) { auto s = v.find_first_not_of(" \t\r\n"); if (s == std::string_view::npos) { return ""; @@ -157,7 +157,7 @@ namespace rdf4cpp::parser { return v; } - bool IStreamQuadIterator::ImplXMLStateCollector::iri_equal_pieces(std::string_view const 
full_iri, std::string_view const uri, std::string_view const local_name) { + bool iri_equal_pieces(std::string_view const full_iri, std::string_view const uri, std::string_view const local_name) { if (full_iri.size() != local_name.size() + uri.size()) { return false; } diff --git a/private/rdf4cpp/parser/XMLParserUtility.hpp b/private/rdf4cpp/parser/XMLParserUtility.hpp new file mode 100644 index 00000000..15e99680 --- /dev/null +++ b/private/rdf4cpp/parser/XMLParserUtility.hpp @@ -0,0 +1,147 @@ +#ifndef RDF4CPP_XMLPARSERSTATECOLLECTOR_H +#define RDF4CPP_XMLPARSERSTATECOLLECTOR_H + +#include + +#include + +#include + +#include +#include +#include +#include + +#include + +#include + +namespace rdf4cpp::parser { + inline std::string_view from_xml_char(xmlChar const *s) { + if (s == nullptr) { + return ""; + } + // ReSharper disable once CppDFALocalValueEscapesFunction + return {reinterpret_cast(s)}; + } + + inline std::string_view from_xml_char(xmlChar const *s, xmlChar const *e) { + if (s == nullptr) { + return ""; + } + // ReSharper disable once CppDFALocalValueEscapesFunction + return {reinterpret_cast(s), reinterpret_cast(e)}; + } + + inline std::string_view from_xml_char(xmlChar const *s, int const n) { + if (s == nullptr) { + return ""; + } + // ReSharper disable once CppDFALocalValueEscapesFunction + return {reinterpret_cast(s), static_cast(n)}; + } + + struct XMLAttribute { + xmlChar const *local_name_raw; + xmlChar const *prefix_raw; + xmlChar const *uri_raw; + xmlChar const *value_start_raw; + xmlChar const *value_end_raw; + + [[nodiscard]] std::string_view value() const { + return from_xml_char(value_start_raw, value_end_raw); + } + + [[nodiscard]] std::string_view local_name() const { + return from_xml_char(local_name_raw); + } + + [[nodiscard]] std::string_view uri() const { + return from_xml_char(uri_raw); + } + }; + + struct XMLStateInfo { + uint64_t line; + uint64_t column; + std::string_view base; + std::string_view lang_tag; + std::string_view 
source; + int source_offset; + }; + + struct XMLOutputQueue { + using value_type = IStreamQuadIterator::value_type; + using state_type = IStreamQuadIterator::state_type; + + private: + std::deque result_queue_; + size_t next_bn_index_ = 0; + state_type *state_; + bool state_is_owned_ = false; + dice::sparse_map::sparse_set reserved_ids_; + + static constexpr std::string_view reify_subject = "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject"; + static constexpr std::string_view reify_predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate"; + static constexpr std::string_view reify_object = "http://www.w3.org/1999/02/22-rdf-syntax-ns#object"; + static constexpr std::string_view reify_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement"; + + template + [[nodiscard]] NT inspect_node(NT node, XMLStateInfo const &i); + + public: + explicit XMLOutputQueue(state_type *state); + ~XMLOutputQueue(); + + XMLOutputQueue(XMLOutputQueue const &) = delete; + XMLOutputQueue &operator=(XMLOutputQueue const &) = delete; + XMLOutputQueue(XMLOutputQueue &&) = delete; + XMLOutputQueue &operator=(XMLOutputQueue &&) = delete; + + [[nodiscard]] bool empty() const; + [[nodiscard]] std::optional next(); + [[nodiscard]] std::string_view current_base_iri() const; + + void add_error(ParsingError::Type ty, std::string msg, XMLStateInfo const &i); + /** + * add statement to the output list, if none of the components is null + * (null is used to track an already inserted parse error for that component) + */ + void add_statement(Node subject, IRI predicate, Node object, IRI reify); + /** + * create an IRI with no checks, intended for hardcoded IRIs like reify_subject + */ + [[nodiscard]] IRI make_hardcoded_iri(std::string_view iri) const; + [[nodiscard]] IRI make_type_iri() const; + [[nodiscard]] IRI make_iri(std::string_view iri, std::string_view base, XMLStateInfo const &i); + [[nodiscard]] IRI make_iri(std::string_view uri, std::string_view local_name, std::string_view 
base, XMLStateInfo const &i); + /** + * create the IRI for an id_attrib, including uniqueness check + */ + [[nodiscard]] IRI make_id(std::string_view local_name, std::string_view base, XMLStateInfo const &i); + [[nodiscard]] Node make_bn(std::optional name, XMLStateInfo const &i); + /** + * creates a literal + * @param value + * @param datatype + * @param lang_tag (ignored, if datatype is set) + * @param i + * @return + */ + [[nodiscard]] Literal make_literal(std::string_view value, std::optional datatype, std::optional lang_tag, XMLStateInfo const &i); + }; + + struct PopState {}; + struct NoStateChange {}; + + struct StateTransition; + + /** + * removes whitespace according to xml spec + */ + [[nodiscard]] std::string_view trim_left(std::string_view v); + [[nodiscard]] bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); + [[nodiscard]] bool iri_reserved(std::string_view uri, std::string_view local_name); +} // namespace rdf4cpp::parser + +#endif //RDF4CPP_XMLPARSERSTATECOLLECTOR_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp index dbfb3e2c..773f9200 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.cpp @@ -1,7 +1,7 @@ #include -namespace rdf4cpp::parser { - IStreamQuadIterator::ImplXML::BaseState::InheritedAttributeInfo IStreamQuadIterator::ImplXMLStateCollector::BaseState::get_inherited_attributes(XMLOutputQueue &out, std::span const attributes, Info const &info) { +namespace rdf4cpp::parser::xml_states { + BaseState::InheritedAttributeInfo BaseState::get_inherited_attributes(XMLOutputQueue &out, std::span const attributes, XMLStateInfo const &info) { InheritedAttributeInfo r{}; for (auto const &a : attributes) { if (iri_equal_pieces(base_attribute, a.uri(), a.local_name())) { @@ -15,4 +15,4 @@ namespace rdf4cpp::parser { } return r; } -} // namespace 
rdf4cpp::parser \ No newline at end of file +} // namespace rdf4cpp::parser::xml_states \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp index 01650b3d..8f3d1e0d 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp @@ -1,19 +1,19 @@ #ifndef RDF4CPP_XMLPARSERBASESTATE_H #define RDF4CPP_XMLPARSERBASESTATE_H -#include +#include -namespace rdf4cpp::parser { +namespace rdf4cpp::parser::xml_states { /** * most states handle one or more of the elements in https://www.w3.org/TR/rdf11-xml/#section-Infoset-Grammar . * note that the creation of a state is done by on_start_element of the previous state. * each state holds information on base iri and language tag defined on the corresponding xml element. */ - struct IStreamQuadIterator::ImplXMLStateCollector::BaseState { // NOLINT(*-special-member-functions) + struct BaseState { // NOLINT(*-special-member-functions) virtual ~BaseState() = default; - [[nodiscard]] virtual StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) = 0; - [[nodiscard]] virtual StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) = 0; - [[nodiscard]] virtual StateTransition on_end_element(XMLOutputQueue &out, Info const &info) = 0; + [[nodiscard]] virtual StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, XMLStateInfo const &info) = 0; + [[nodiscard]] virtual StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, XMLStateInfo const &info) = 0; + [[nodiscard]] virtual StateTransition on_end_element(XMLOutputQueue &out, XMLStateInfo const &info) = 0; virtual void move_to(BaseState *b) noexcept = 0; struct InheritedAttributeInfo { @@ -30,8 +30,8 @@ 
namespace rdf4cpp::parser { static constexpr std::string_view base_attribute = "http://www.w3.org/XML/1998/namespacebase"; static constexpr std::string_view lang_attribute = "http://www.w3.org/XML/1998/namespacelang"; - static InheritedAttributeInfo get_inherited_attributes(XMLOutputQueue &out, std::span attributes, Info const &info); + static InheritedAttributeInfo get_inherited_attributes(XMLOutputQueue &out, std::span attributes, XMLStateInfo const &info); }; -} // namespace rdf4cpp::parser +} // namespace rdf4cpp::parser::xml_states #endif //RDF4CPP_XMLPARSERBASESTATE_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp index 0bb4c2e2..141e6e6c 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp @@ -1,14 +1,14 @@ #include -namespace rdf4cpp::parser { - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::CollectionState::on_characters(XMLOutputQueue &out, std::string_view const chars, Info const &info) { +namespace rdf4cpp::parser::xml_states { + StateTransition CollectionState::on_characters(XMLOutputQueue &out, std::string_view const chars, XMLStateInfo const &info) { if (!trim_left(chars).empty()) { out.add_error(ParsingError::Type::BadSyntax, "expected element, found characters", info); } return {}; } - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::CollectionState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span const attributes, Info const &info) { + StateTransition CollectionState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span const attributes, XMLStateInfo const &info) { auto [transition, obj] = DescriptionState::enter(out, local_name, uri, 
attributes, info); if (first) { first = false; @@ -23,7 +23,7 @@ namespace rdf4cpp::parser { return transition; } - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::CollectionState::on_end_element(XMLOutputQueue &out, [[maybe_unused]] Info const &info) { + StateTransition CollectionState::on_end_element(XMLOutputQueue &out, [[maybe_unused]] XMLStateInfo const &info) { auto const nil = out.make_hardcoded_iri(iri_nil); if (first) { out.add_statement(subject, predicate, nil, reify); @@ -32,7 +32,7 @@ namespace rdf4cpp::parser { } return StateTransition{std::in_place_type_t{}}; } - void IStreamQuadIterator::ImplXMLStateCollector::CollectionState::move_to(BaseState *b) noexcept { + void CollectionState::move_to(BaseState *b) noexcept { new (b) CollectionState(std::move(*this)); } -} // namespace rdf4cpp::parser \ No newline at end of file +} // namespace rdf4cpp::parser::xml_states \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp index d23e7157..487848b0 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp @@ -1,16 +1,16 @@ #ifndef XMLPARSERCOLLECTIONSTATE_HPP #define XMLPARSERCOLLECTIONSTATE_HPP -#include +#include -namespace rdf4cpp::parser { +namespace rdf4cpp::parser::xml_states { /** * state for https://www.w3.org/TR/rdf11-xml/#parseTypeCollectionPropertyElt */ - struct IStreamQuadIterator::ImplXMLStateCollector::CollectionState final : BaseState { - StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) override; - StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; - StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; + struct 
CollectionState final : BaseState { + StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, XMLStateInfo const &info) override; + StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, XMLStateInfo const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, XMLStateInfo const &info) override; void move_to(BaseState *b) noexcept override; Node subject; @@ -27,6 +27,6 @@ namespace rdf4cpp::parser { static constexpr std::string_view iri_rest = "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest"; static constexpr std::string_view iri_first = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first"; }; -} // namespace rdf4cpp::parser +} // namespace rdf4cpp::parser::xml_states #endif // XMLPARSERCOLLECTIONSTATE_HPP \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp index 21203cc0..86100a88 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp @@ -1,14 +1,14 @@ #include -namespace rdf4cpp::parser { - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::DescriptionState::on_characters(XMLOutputQueue &out, std::string_view const chars, Info const &info) { +namespace rdf4cpp::parser::xml_states { + StateTransition DescriptionState::on_characters(XMLOutputQueue &out, std::string_view const chars, XMLStateInfo const &info) { if (!trim_left(chars).empty()) { out.add_error(ParsingError::Type::BadSyntax, "expected predicate, found characters", info); } return {}; } - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::DescriptionState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span attributes, Info const 
&info) { + StateTransition DescriptionState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span attributes, XMLStateInfo const &info) { auto const inherited_attribute_info = get_inherited_attributes(out, attributes, info); IRI predicate; if (iri_equal_pieces(PredicateState::list_start_element, uri, local_name)) { @@ -78,13 +78,13 @@ namespace rdf4cpp::parser { } } - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::DescriptionState::on_end_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] Info const &info) { + StateTransition DescriptionState::on_end_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] XMLStateInfo const &info) { return StateTransition{std::in_place_type_t{}}; } - void IStreamQuadIterator::ImplXMLStateCollector::DescriptionState::move_to(BaseState *b) noexcept { + void DescriptionState::move_to(BaseState *b) noexcept { new (b) DescriptionState(std::move(*this)); } - std::pair IStreamQuadIterator::ImplXMLStateCollector::DescriptionState::enter(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) { + std::pair DescriptionState::enter(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, XMLStateInfo const &info) { auto const inherited_attribute_info = get_inherited_attributes(out, attributes, info); Node sub = Node::make_null(); auto check_only_one = [&sub, &out, &info]() { @@ -139,4 +139,4 @@ namespace rdf4cpp::parser { sub, }; } -} // namespace rdf4cpp::parser \ No newline at end of file +} // namespace rdf4cpp::parser::xml_states \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp index 5bddb1c0..fbd48864 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp +++ 
b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp @@ -1,19 +1,19 @@ #ifndef RDF4CPP_XMLPARSERDESCRIPTIONSTATE_H #define RDF4CPP_XMLPARSERDESCRIPTIONSTATE_H -#include +#include #include -namespace rdf4cpp::parser { +namespace rdf4cpp::parser::xml_states { /** * state for https://www.w3.org/TR/rdf11-xml/#nodeElementList and https://www.w3.org/TR/rdf11-xml/#nodeElement * on_start_element checks and dispatches for the different options in https://www.w3.org/TR/rdf11-xml/#propertyEltList * (https://www.w3.org/TR/rdf11-xml/#parseTypeResourcePropertyElt has no own state, instead gets handled directly by on_start_element) */ - struct IStreamQuadIterator::ImplXMLStateCollector::DescriptionState final : BaseState { - StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) override; - StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; - StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; + struct DescriptionState final : BaseState { + StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, XMLStateInfo const &info) override; + StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, XMLStateInfo const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, XMLStateInfo const &info) override; void move_to(BaseState *b) noexcept override; Node subject; @@ -23,7 +23,7 @@ namespace rdf4cpp::parser { : BaseState(i), subject(sub) { } - static std::pair enter(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info); + static std::pair enter(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, XMLStateInfo const &info); static constexpr std::string_view start_element = 
"http://www.w3.org/1999/02/22-rdf-syntax-ns#Description"; static constexpr std::string_view about_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about"; @@ -31,6 +31,6 @@ namespace rdf4cpp::parser { static constexpr std::string_view node_id_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID"; static constexpr std::string_view type_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; }; -} +} // namespace rdf4cpp::parser::xml_states #endif //RDF4CPP_XMLPARSERDESCRIPTIONSTATE_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp index 482d6d01..f020a576 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp @@ -1,22 +1,22 @@ #include -namespace rdf4cpp::parser { - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::EmptyElement::on_characters(XMLOutputQueue &out, std::string_view const chars, Info const &info) { +namespace rdf4cpp::parser::xml_states { + StateTransition EmptyElement::on_characters(XMLOutputQueue &out, std::string_view const chars, XMLStateInfo const &info) { if (!trim_left(chars).empty()) { out.add_error(ParsingError::Type::BadSyntax, "expected end of element, found characters", info); } return {}; } - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::EmptyElement::on_start_element(XMLOutputQueue &out, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes, Info const &info) { + StateTransition EmptyElement::on_start_element(XMLOutputQueue &out, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes, XMLStateInfo const &info) { out.add_error(ParsingError::Type::BadSyntax, "expected end of element, found ???", info); return {}; } - 
IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::EmptyElement::on_end_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] Info const &info) { + StateTransition EmptyElement::on_end_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] XMLStateInfo const &info) { return StateTransition{std::in_place_type_t{}}; } - void IStreamQuadIterator::ImplXMLStateCollector::EmptyElement::move_to(BaseState *b) noexcept { + void EmptyElement::move_to(BaseState *b) noexcept { new (b) EmptyElement(std::move(*this)); } -} // namespace rdf4cpp::parser \ No newline at end of file +} // namespace rdf4cpp::parser::xml_states \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp index 77135624..d28418b7 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp @@ -1,22 +1,22 @@ #ifndef XMLPARSEREMPTYELEMENT_HPP #define XMLPARSEREMPTYELEMENT_HPP -#include +#include -namespace rdf4cpp::parser { +namespace rdf4cpp::parser::xml_states { /** * state for https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (if attributes are present) */ - struct IStreamQuadIterator::ImplXMLStateCollector::EmptyElement final : BaseState { - StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) override; - StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; - StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; + struct EmptyElement final : BaseState { + StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, XMLStateInfo const &info) override; + StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, 
XMLStateInfo const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, XMLStateInfo const &info) override; void move_to(BaseState *b) noexcept override; EmptyElement() : BaseState({}) { } }; -} // namespace rdf4cpp::parser +} // namespace rdf4cpp::parser::xml_states #endif // XMLPARSEREMPTYELEMENT_HPP \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp index 20e37bc6..5f380ba8 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp @@ -1,14 +1,14 @@ #include -namespace rdf4cpp::parser { - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::InitialState::on_characters(XMLOutputQueue &out, std::string_view const chars, Info const &info) { +namespace rdf4cpp::parser::xml_states { + StateTransition InitialState::on_characters(XMLOutputQueue &out, std::string_view const chars, XMLStateInfo const &info) { if (!trim_left(chars).empty()) { out.add_error(ParsingError::Type::BadSyntax, "expected RDF, found characters", info); } return {}; } - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::InitialState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes, Info const &info) { + StateTransition InitialState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes, XMLStateInfo const &info) { if (iri_equal_pieces(RDFState::start_element, uri, local_name)) { return StateTransition{ std::in_place_type_t{}, @@ -19,11 +19,11 @@ namespace rdf4cpp::parser { return {}; } - IStreamQuadIterator::ImplXMLStateCollector::StateTransition 
IStreamQuadIterator::ImplXMLStateCollector::InitialState::on_end_element(XMLOutputQueue &out, Info const &info) { + StateTransition InitialState::on_end_element(XMLOutputQueue &out, XMLStateInfo const &info) { out.add_error(ParsingError::Type::BadSyntax, "expected RDF, found end of ???", info); return {}; } - void IStreamQuadIterator::ImplXMLStateCollector::InitialState::move_to(BaseState *b) noexcept { + void InitialState::move_to(BaseState *b) noexcept { new (b) InitialState(std::move(*this)); } -} // namespace rdf4cpp::parser \ No newline at end of file +} // namespace rdf4cpp::parser::xml_states \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp index caf0d42c..913bb1d3 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp @@ -1,23 +1,23 @@ #ifndef RDF4CPP_XMLPARSERINITIALSTATE_H #define RDF4CPP_XMLPARSERINITIALSTATE_H -#include +#include #include -namespace rdf4cpp::parser { +namespace rdf4cpp::parser::xml_states { /** * initial state, checks for start of https://www.w3.org/TR/rdf11-xml/#RDF */ - struct IStreamQuadIterator::ImplXMLStateCollector::InitialState final : BaseState { - StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) override; - StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; - StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; + struct InitialState final : BaseState { + StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, XMLStateInfo const &info) override; + StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, XMLStateInfo const &info) override; + StateTransition 
on_end_element(XMLOutputQueue &out, XMLStateInfo const &info) override; void move_to(BaseState *b) noexcept override; InitialState() : BaseState({}) { } }; -} +} // namespace rdf4cpp::parser::xml_states #endif //RDF4CPP_XMLPARSERINITIALSTATE_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp index 0df77bad..dec8cdc6 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp @@ -1,8 +1,8 @@ #include -namespace rdf4cpp::parser { +namespace rdf4cpp::parser::xml_states { - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::PredicateState::on_characters([[maybe_unused]] XMLOutputQueue &out, std::string_view const chars, Info const &info) { + StateTransition PredicateState::on_characters([[maybe_unused]] XMLOutputQueue &out, std::string_view const chars, XMLStateInfo const &info) { if (done) { if (!trim_left(chars).empty()) { out.add_error(ParsingError::Type::BadSyntax, "expected end of element, found literal", info); @@ -13,7 +13,7 @@ namespace rdf4cpp::parser { return {}; } - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::PredicateState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span const attributes, Info const &info) { + StateTransition PredicateState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span const attributes, XMLStateInfo const &info) { if (!trim_left(literal).empty()) { out.add_error(ParsingError::Type::BadSyntax, "expected end of element or literal, found element", info); return {}; @@ -28,19 +28,19 @@ namespace rdf4cpp::parser { return transition; } - IStreamQuadIterator::ImplXMLStateCollector::StateTransition 
IStreamQuadIterator::ImplXMLStateCollector::PredicateState::on_end_element(XMLOutputQueue &out, Info const &info) { + StateTransition PredicateState::on_end_element(XMLOutputQueue &out, XMLStateInfo const &info) { if (!done) { Literal const lit = out.make_literal(literal, std::nullopt, std::nullopt, info); out.add_statement(subject, predicate, lit, reify); } return StateTransition{std::in_place_type_t{}}; } - void IStreamQuadIterator::ImplXMLStateCollector::PredicateState::move_to(BaseState *b) noexcept { + void PredicateState::move_to(BaseState *b) noexcept { new (b) PredicateState(std::move(*this)); } - bool IStreamQuadIterator::ImplXMLStateCollector::PredicateState::iri_reserved_predicate(std::string_view const uri, std::string_view const local_name) { + bool PredicateState::iri_reserved_predicate(std::string_view const uri, std::string_view const local_name) { return iri_reserved(uri, local_name) || iri_equal_pieces(DescriptionState::start_element, uri, local_name) || iri_equal_pieces(list_start_element, uri, local_name); } -} // namespace rdf4cpp::parser \ No newline at end of file +} // namespace rdf4cpp::parser::xml_states \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp index c9e8d8e1..c6553683 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp @@ -1,19 +1,19 @@ #ifndef RDF4CPP_XMLPARSERPREDICATESTATE_H #define RDF4CPP_XMLPARSERPREDICATESTATE_H -#include +#include #include -namespace rdf4cpp::parser { +namespace rdf4cpp::parser::xml_states { /** * state for https://www.w3.org/TR/rdf11-xml/#resourcePropertyElt (nested nodeElement / DescriptionState * and https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (literal with no datatype attribute) * and https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (with no attributes (empty literal)) */ - struct 
IStreamQuadIterator::ImplXMLStateCollector::PredicateState : BaseState { - StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) override; - StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; - StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; + struct PredicateState : BaseState { + StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, XMLStateInfo const &info) override; + StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, XMLStateInfo const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, XMLStateInfo const &info) override; void move_to(BaseState *b) noexcept override; Node subject; @@ -35,6 +35,6 @@ namespace rdf4cpp::parser { static bool iri_reserved_predicate(std::string_view uri, std::string_view local_name); }; -} +} // namespace rdf4cpp::parser::xml_states #endif //RDF4CPP_XMLPARSERPREDICATESTATE_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp index f745a2e9..9bf6a6cf 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp @@ -1,22 +1,22 @@ #include -namespace rdf4cpp::parser { - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::RDFState::on_characters(XMLOutputQueue &out, std::string_view const chars, Info const &info) { +namespace rdf4cpp::parser::xml_states { + StateTransition RDFState::on_characters(XMLOutputQueue &out, std::string_view const chars, XMLStateInfo const &info) { if (!trim_left(chars).empty()) { out.add_error(ParsingError::Type::BadSyntax, "expected Description, found characters", info); } return {}; } - 
IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::RDFState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span const attributes, Info const &info) { + StateTransition RDFState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span const attributes, XMLStateInfo const &info) { auto [trans, _] = DescriptionState::enter(out, local_name, uri, attributes, info); return trans; } - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::RDFState::on_end_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] Info const &info) { + StateTransition RDFState::on_end_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] XMLStateInfo const &info) { return StateTransition{std::in_place_type_t{}}; } - void IStreamQuadIterator::ImplXMLStateCollector::RDFState::move_to(BaseState *b) noexcept { + void RDFState::move_to(BaseState *b) noexcept { new (b) RDFState(std::move(*this)); } -} // namespace rdf4cpp::parser \ No newline at end of file +} // namespace rdf4cpp::parser::xml_states \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp index b05518e1..2730b109 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp @@ -1,23 +1,23 @@ #ifndef RDF4CPP_XMLPARSERRDFSTATE_H #define RDF4CPP_XMLPARSERRDFSTATE_H -#include +#include #include -namespace rdf4cpp::parser { +namespace rdf4cpp::parser::xml_states { /** * state for https://www.w3.org/TR/rdf11-xml/#RDF */ - struct IStreamQuadIterator::ImplXMLStateCollector::RDFState final : BaseState { - StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, Info const &info) override; - StateTransition 
on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; - StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; + struct RDFState final : BaseState { + StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, XMLStateInfo const &info) override; + StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, XMLStateInfo const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, XMLStateInfo const &info) override; void move_to(BaseState *b) noexcept override; static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF"; using BaseState::BaseState; }; -} +} // namespace rdf4cpp::parser::xml_states #endif //RDF4CPP_XMLPARSERRDFSTATE_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp index 0cc97682..234900f3 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp @@ -1,19 +1,19 @@ #include -namespace rdf4cpp::parser { - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::TypedLiteralPredicateState::on_start_element(XMLOutputQueue &out, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes, Info const &info) { +namespace rdf4cpp::parser::xml_states { + StateTransition TypedLiteralPredicateState::on_start_element(XMLOutputQueue &out, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes, XMLStateInfo const &info) { out.add_error(ParsingError::Type::BadSyntax, "expected literal, found element", info); return {}; } - 
IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::TypedLiteralPredicateState::on_end_element(XMLOutputQueue &out, Info const &info) { + StateTransition TypedLiteralPredicateState::on_end_element(XMLOutputQueue &out, XMLStateInfo const &info) { if (!datatype.null()) { Literal const lit = out.make_literal(literal, datatype, std::nullopt, info); out.add_statement(subject, predicate, lit, reify); } return StateTransition{std::in_place_type_t{}}; } - void IStreamQuadIterator::ImplXMLStateCollector::TypedLiteralPredicateState::move_to(BaseState *b) noexcept { + void TypedLiteralPredicateState::move_to(BaseState *b) noexcept { new (b) TypedLiteralPredicateState(std::move(*this)); } -} // namespace rdf4cpp::parser \ No newline at end of file +} // namespace rdf4cpp::parser::xml_states \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp index 66de9a2a..787ecf75 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp @@ -1,16 +1,16 @@ #ifndef RDF4CPP_XMLPARSERTYPEDLITERALPREDICATESTATE_H #define RDF4CPP_XMLPARSERTYPEDLITERALPREDICATESTATE_H -#include +#include #include -namespace rdf4cpp::parser { +namespace rdf4cpp::parser::xml_states { /** * state for https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (with datatype attribute) */ - struct IStreamQuadIterator::ImplXMLStateCollector::TypedLiteralPredicateState final : PredicateState { - StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; - StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; + struct TypedLiteralPredicateState final : PredicateState { + StateTransition on_start_element(XMLOutputQueue 
&out, std::string_view local_name, std::string_view uri, std::span attributes, XMLStateInfo const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, XMLStateInfo const &info) override; void move_to(BaseState *b) noexcept override; IRI datatype; @@ -21,6 +21,6 @@ namespace rdf4cpp::parser { static constexpr std::string_view datatype_attrib = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype"; }; -} +} // namespace rdf4cpp::parser::xml_states #endif //RDF4CPP_XMLPARSERTYPEDLITERALPREDICATESTATE_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp index 7cb4df12..78bdd5d0 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp @@ -1,18 +1,18 @@ #include -namespace rdf4cpp::parser { - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::XMLLiteralState::on_characters([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] std::string_view chars, Info const &info) { +namespace rdf4cpp::parser::xml_states { + StateTransition XMLLiteralState::on_characters([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] std::string_view chars, XMLStateInfo const &info) { source_input(info); return {}; } - IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::XMLLiteralState::on_start_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes, Info const &info) { + StateTransition XMLLiteralState::on_start_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes, XMLStateInfo const &info) { ++depth; source_input(info); return {}; } - 
IStreamQuadIterator::ImplXMLStateCollector::StateTransition IStreamQuadIterator::ImplXMLStateCollector::XMLLiteralState::on_end_element(XMLOutputQueue &out, Info const &info) { + StateTransition XMLLiteralState::on_end_element(XMLOutputQueue &out, XMLStateInfo const &info) { if (depth > 0) { --depth; source_input(info); @@ -32,11 +32,11 @@ namespace rdf4cpp::parser { out.add_statement(subject, predicate, lit, reify); return StateTransition{std::in_place_type_t{}}; } - void IStreamQuadIterator::ImplXMLStateCollector::XMLLiteralState::move_to(BaseState *b) noexcept { + void XMLLiteralState::move_to(BaseState *b) noexcept { new (b) XMLLiteralState(std::move(*this)); } - void IStreamQuadIterator::ImplXMLStateCollector::XMLLiteralState::source_input(Info const &info) { + void XMLLiteralState::source_input(XMLStateInfo const &info) { int const off = info.source_offset; std::string_view const sv = info.source; if (literal.empty()) { @@ -48,4 +48,4 @@ namespace rdf4cpp::parser { } last_offset = static_cast(off) + last_size; } -} // namespace rdf4cpp::parser \ No newline at end of file +} // namespace rdf4cpp::parser::xml_states \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp index 0bdeb5fe..3405cf93 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp @@ -1,18 +1,18 @@ #ifndef RDF4CPP_XMLPARSERXMLLITERALESTATE_H #define RDF4CPP_XMLPARSERXMLLITERALESTATE_H -#include +#include #include -namespace rdf4cpp::parser { +namespace rdf4cpp::parser::xml_states { /** * state for https://www.w3.org/TR/rdf11-xml/#parseTypeLiteralPropertyElt * (and https://www.w3.org/TR/rdf11-xml/#parseTypeOtherPropertyElt) */ - struct IStreamQuadIterator::ImplXMLStateCollector::XMLLiteralState final : PredicateState { - StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, 
Info const &info) override; - StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, Info const &info) override; - StateTransition on_end_element(XMLOutputQueue &out, Info const &info) override; + struct XMLLiteralState final : PredicateState { + StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, XMLStateInfo const &info) override; + StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, XMLStateInfo const &info) override; + StateTransition on_end_element(XMLOutputQueue &out, XMLStateInfo const &info) override; void move_to(BaseState *b) noexcept override; size_t depth = 0; @@ -20,12 +20,12 @@ namespace rdf4cpp::parser { size_t last_offset = 0; size_t last_size = 0; - void source_input(Info const &info); + void source_input(XMLStateInfo const &info); - XMLLiteralState(InheritedAttributeInfo const &i, Node sub, IRI predicate, IRI reify, Info const &info) : PredicateState(i, sub, predicate, reify) { + XMLLiteralState(InheritedAttributeInfo const &i, Node sub, IRI predicate, IRI reify, XMLStateInfo const &info) : PredicateState(i, sub, predicate, reify) { source_input(info); } }; -} +} // namespace rdf4cpp::parser::xml_states #endif //RDF4CPP_XMLPARSERXMLLITERALESTATE_H From 5682b8c42a4dee8c9c91fc51b058ef5cad0f0ea0 Mon Sep 17 00:00:00 2001 From: mcb Date: Thu, 11 Dec 2025 14:35:55 +0100 Subject: [PATCH 35/42] prepare for tests --- tests/parser/tests_XMLParser.cpp | 152 ++++++++++++++++++++++++++++++- 1 file changed, 151 insertions(+), 1 deletion(-) diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index 3d1f9f99..3e202eec 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -886,4 +886,154 @@ TEST_CASE("rdf xml negative tests") { ++xml_iter; } REQUIRE(expected_msg.empty()); -} \ No newline at end of file +} + +void 
xml_test_positive(std::string xml_str, std::string nt_str, std::string_view base_iri) { + CAPTURE(base_iri); + + IStreamQuadIterator::state_type state{}; + CHECK(state.iri_factory.set_base(base_iri) == IRIFactoryError::Ok); + std::stringstream xml{std::move(xml_str)}; + IStreamQuadIterator xml_iter{xml, ParsingFlag::RdfXml, &state}; + std::vector xml_results; + + std::stringstream nt{std::move(nt_str)}; + IStreamQuadIterator nt_iter{nt, ParsingFlag::NTriples}; + std::vector nt_results; + + static constexpr auto read_iter_to = [](IStreamQuadIterator& i, std::vector& r) { + while (i != std::default_sentinel) { + if (!i->has_value()) { + FAIL(i->error().message); + } + r.emplace_back(i->value()); + ++i; + } + }; + read_iter_to(xml_iter, xml_results); + read_iter_to(nt_iter, nt_results); + + REQUIRE(xml_results.size() == nt_results.size()); + + static constexpr auto num_blanks = [](const query::QuadPattern& p) { + size_t n = 0; + if (p.subject().is_blank_node()) { + ++n; + } + if (p.predicate().is_blank_node()) { + ++n; + } + if (p.object().is_blank_node()) { + ++n; + } + return n; + }; + static constexpr auto sort = [](std::vector& v) { + std::sort(v.begin(), v.end(), [](const query::QuadPattern& a, const query::QuadPattern& b) { + auto a_bl = num_blanks(a); + auto b_bl = num_blanks(b); + if (a_bl != b_bl) { + return std::less{}(a_bl, b_bl); + } + if (a.subject() != b.subject()) { + return std::less{}(a.subject(), b.subject()); + } + if (a.predicate() != b.predicate()) { + return std::less{}(a.predicate(), b.predicate()); + } + return std::less{}(a.object(), b.object()); + }); + }; + sort(xml_results); + sort(nt_results); + + std::map bn_map{}; + auto check = [&bn_map](Node xml, Node nt) { + if (nt.is_blank_node() && xml.is_blank_node()) { + auto i = bn_map.find(nt.as_blank_node()); + if (i != bn_map.end()) { + CHECK(xml.as_blank_node() == i->second.as_blank_node()); + } + else { + bn_map[nt.as_blank_node()] = xml.as_blank_node(); + } + } + else { + CHECK(xml == nt); 
+ } + }; + + for (size_t i = 0; i < nt_results.size(); ++i) { + check(xml_results.at(i).subject(), nt_results.at(i).subject()); + check(xml_results.at(i).predicate(), nt_results.at(i).predicate()); + check(xml_results.at(i).object(), nt_results.at(i).object()); + } +} + +void xml_test_negative(std::string xml_str, std::string_view base_iri) { + CAPTURE(base_iri); + + + std::stringstream xml{std::move(xml_str)}; + IStreamQuadIterator xml_iter{xml, ParsingFlag::RdfXml}; + + bool had_error = false; + while (xml_iter != std::default_sentinel) { + if (xml_iter->has_value()) { + ++xml_iter; + continue; + } + had_error = true; + ++xml_iter; + } + CHECK(had_error == true); +} + +TEST_CASE("test xml tests") { + // TODO replace with tests by Nikos + // basic functionality + xml_test_positive(R"( + + + +xxx + + +)", R"( "xxx" .)", "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/amp-in-url/test001.rdf"); + + xml_test_negative(R"(" + " + + + +)", "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error001.rdf"); + + // check reordering & base + xml_test_positive(R"( + + + + 1 + 2 + + + + + +)", R"(_:bar . +_:bar "1" . + . + _:bar . + . + "1" . +_:bar "2"^^ . +_:bar _:res . +_:res . +_:bar _:res2 . + . + _:bar . + . + _:res2 . 
+_:res2 "foobar" .)", "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/test004.rdf"); +} From 286815adcca832266db4a9ae7f93c2130dbbc4f9 Mon Sep 17 00:00:00 2001 From: Liss Heidrich <31625940+liss-h@users.noreply.github.com> Date: Wed, 17 Dec 2025 13:59:38 +0100 Subject: [PATCH 36/42] remove self-referentialness of struct and minor cleanup --- private/rdf4cpp/parser/XMLParser.cpp | 46 +++++---------- private/rdf4cpp/parser/XMLParser.hpp | 66 ++++++++++------------ src/rdf4cpp/parser/IStreamQuadIterator.hpp | 2 +- 3 files changed, 47 insertions(+), 67 deletions(-) diff --git a/private/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp index 3cb9d87b..1bdccf3f 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -15,49 +15,36 @@ namespace rdf4cpp::parser { }; r.characters = [](void *th, xmlChar const *e, int const len) { auto *t = static_cast(th); - t->handle_state_transition(t->current_state_->on_characters(t->output_, from_xml_char(e, len), t->make_info())); + t->handle_state_transition(t->current_state().on_characters(t->output_, from_xml_char(e, len), t->make_info())); }; r.startElementNs = [](void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri, [[maybe_unused]] int n_namespaces, [[maybe_unused]] xmlChar const **namespaces, int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) { auto *t = static_cast(th); - t->handle_state_transition(t->current_state_->on_start_element(t->output_, from_xml_char(local_name), from_xml_char(uri), + t->handle_state_transition(t->current_state().on_start_element(t->output_, from_xml_char(local_name), from_xml_char(uri), std::span{reinterpret_cast(attributes), static_cast(n_attributes)}, t->make_info())); }; r.endElementNs = [](void *th, [[maybe_unused]] xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, [[maybe_unused]] xmlChar const *uri) { auto *t = 
static_cast(th); - t->handle_state_transition(t->current_state_->on_end_element(t->output_, t->make_info())); + t->handle_state_transition(t->current_state().on_end_element(t->output_, t->make_info())); }; r.warning = on_error; r.error = on_error; return r; } + void IStreamQuadIterator::ImplXML::handle_state_transition(StateTransition transition) { - std::visit([&](T &&s) { - if constexpr (std::same_as) { - return; - } else if constexpr (std::same_as) { - pop_state(); - } else { - state_stack_.emplace_back(std::in_place_type_t{}, std::forward(s)); - update_current_state(); + dice::template_library::match(std::move(transition.modify_state), + [](NoStateChange) { + // noop + }, + [this](PopState) { + state_stack_.pop_back(); + }, + [this](S &&new_state) { + state_stack_.emplace_back(std::in_place_type, std::forward(new_state)); } - }, - std::move(transition.modify_state)); - } - - void IStreamQuadIterator::ImplXML::update_current_state() { - if (state_stack_.empty()) { - current_state_ = nullptr; - return; - } - current_state_ = &state_stack_.back().get(); - } - - void IStreamQuadIterator::ImplXML::pop_state() { - assert(!state_stack_.empty()); - state_stack_.pop_back(); - update_current_state(); + ); } // implemented here, to have access to states @@ -140,11 +127,8 @@ namespace rdf4cpp::parser { xmlCtxtSetOptions(context_.get(), XML_PARSE_NOENT | XML_PARSE_PEDANTIC | XML_PARSE_NOCDATA | XML_PARSE_NO_XXE | XML_PARSE_BIG_LINES); state_stack_.reserve(10); state_stack_.emplace_back(std::in_place_type_t{}); - update_current_state(); - current_state_->base = output_.current_base_iri(); - } - IStreamQuadIterator::ImplXML::~ImplXML() { // NOLINT(*-use-equals-default) + current_state().base = output_.current_base_iri(); } std::optional IStreamQuadIterator::ImplXML::next() { diff --git a/private/rdf4cpp/parser/XMLParser.hpp b/private/rdf4cpp/parser/XMLParser.hpp index 2cd058d9..f4d585b8 100644 --- a/private/rdf4cpp/parser/XMLParser.hpp +++ 
b/private/rdf4cpp/parser/XMLParser.hpp @@ -1,41 +1,29 @@ #ifndef RDF4CPP_XMLPARSER_H #define RDF4CPP_XMLPARSER_H -#include - #include - #include - #include #include - -#include - -#include - -#include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include +#include +#include + +#include + +#include +#include + namespace rdf4cpp::parser { struct IStreamQuadIterator::ImplXML final : Impl { private: @@ -54,18 +42,25 @@ namespace rdf4cpp::parser { EOFFunc eof_func_; XMLOutputQueue output_; - xml_states::BaseState *current_state_ = nullptr; - std::vector> - state_stack_; + using State = dice::template_library::inplace_polymorphic; + + std::vector state_stack_; // Note: we use a vector because std::stack does not have .reserve() + + [[nodiscard]] xml_states::BaseState const ¤t_state() const noexcept { + return *state_stack_.back(); + } + + [[nodiscard]] xml_states::BaseState ¤t_state() noexcept { + return *state_stack_.back(); + } static xmlSAXHandler make_sax_handler(); void handle_state_transition(StateTransition transition); - void update_current_state(); - void pop_state(); static void on_error(void *th, char const *msg, ...); @@ -73,12 +68,12 @@ namespace rdf4cpp::parser { public: ImplXML(void *obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type *state); - ~ImplXML() override; ImplXML(ImplXML const &) = delete; ImplXML &operator=(ImplXML const &) = delete; ImplXML(ImplXML &&) = delete; ImplXML &operator=(ImplXML &&) = delete; + ~ImplXML() override = default; [[nodiscard]] std::optional next() override; @@ -87,17 +82,18 @@ namespace rdf4cpp::parser { }; struct StateTransition { - using ModifyStateStack = std::variant; + using ModifyStateStack = std::variant; ModifyStateStack modify_state; - template - explicit StateTransition(T &&...a) : modify_state(std::forward(a)...) { + template + explicit StateTransition(Args &&...args) : modify_state(std::forward(args)...) 
{ } - StateTransition() : StateTransition(std::in_place_type_t{}) { + StateTransition() noexcept : StateTransition(std::in_place_type_t{}) { } }; } // namespace rdf4cpp::parser diff --git a/src/rdf4cpp/parser/IStreamQuadIterator.hpp b/src/rdf4cpp/parser/IStreamQuadIterator.hpp index d1f9c13d..47148b07 100644 --- a/src/rdf4cpp/parser/IStreamQuadIterator.hpp +++ b/src/rdf4cpp/parser/IStreamQuadIterator.hpp @@ -101,8 +101,8 @@ struct IStreamQuadIterator { Impl &operator=(Impl const &) = delete; Impl &operator=(Impl &&) = delete; }; + struct ImplSerd; - struct ImplXMLStateCollector; struct ImplXML; std::unique_ptr impl; From 88d93914955d128deefe34bd38226f065c677b21 Mon Sep 17 00:00:00 2001 From: mcb Date: Fri, 9 Jan 2026 14:04:57 +0100 Subject: [PATCH 37/42] review --- private/rdf4cpp/parser/XMLParser.cpp | 60 ++++++++++--------- private/rdf4cpp/parser/XMLParser.hpp | 21 ++----- .../parser/XMLParserStateTransition.hpp | 39 ++++++++++++ .../XMLStates/XMLParserCollectionState.cpp | 2 + .../XMLStates/XMLParserCollectionState.hpp | 9 +++ .../XMLStates/XMLParserDescriptionState.cpp | 5 +- .../XMLStates/XMLParserDescriptionState.hpp | 14 +++++ .../XMLStates/XMLParserEmptyElement.cpp | 2 + .../XMLStates/XMLParserEmptyElement.hpp | 5 ++ .../XMLStates/XMLParserInitialState.cpp | 2 + .../XMLStates/XMLParserPredicateState.cpp | 2 + .../XMLStates/XMLParserPredicateState.hpp | 11 ++++ .../parser/XMLStates/XMLParserRDFState.cpp | 2 + .../parser/XMLStates/XMLParserRDFState.hpp | 5 ++ .../XMLParserTypedLiteralPredicateState.cpp | 2 + .../XMLParserTypedLiteralPredicateState.hpp | 5 ++ .../XMLStates/XMLParserXMLLiteralState.cpp | 2 + .../XMLStates/XMLParserXMLLiteralState.hpp | 5 ++ 18 files changed, 149 insertions(+), 44 deletions(-) create mode 100644 private/rdf4cpp/parser/XMLParserStateTransition.hpp diff --git a/private/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp index 3cb9d87b..e9583199 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ 
b/private/rdf4cpp/parser/XMLParser.cpp @@ -1,5 +1,7 @@ #include +#include + #include namespace rdf4cpp::parser { @@ -7,27 +9,11 @@ namespace rdf4cpp::parser { xmlSAXHandler r{}; std::memset(&r, 0, sizeof(xmlSAXHandler)); r.initialized = XML_SAX2_MAGIC; - r.getParameterEntity = [](void *, xmlChar const *e) { - return xmlGetPredefinedEntity(e); - }; - r.getEntity = [](void *, xmlChar const *e) { - return xmlGetPredefinedEntity(e); - }; - r.characters = [](void *th, xmlChar const *e, int const len) { - auto *t = static_cast(th); - t->handle_state_transition(t->current_state_->on_characters(t->output_, from_xml_char(e, len), t->make_info())); - }; - r.startElementNs = [](void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri, - [[maybe_unused]] int n_namespaces, [[maybe_unused]] xmlChar const **namespaces, - int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) { - auto *t = static_cast(th); - t->handle_state_transition(t->current_state_->on_start_element(t->output_, from_xml_char(local_name), from_xml_char(uri), - std::span{reinterpret_cast(attributes), static_cast(n_attributes)}, t->make_info())); - }; - r.endElementNs = [](void *th, [[maybe_unused]] xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, [[maybe_unused]] xmlChar const *uri) { - auto *t = static_cast(th); - t->handle_state_transition(t->current_state_->on_end_element(t->output_, t->make_info())); - }; + r.getParameterEntity = get_entity; + r.getEntity = get_entity; + r.characters = on_characters; + r.startElementNs = on_start_element; + r.endElementNs = on_end_element; r.warning = on_error; r.error = on_error; return r; @@ -96,11 +82,29 @@ namespace rdf4cpp::parser { t->output_.add_error(ParsingError::Type::BadSyntax, std::move(out), t->make_info()); va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay) } + xmlEntity *IStreamQuadIterator::ImplXML::get_entity(void *, xmlChar const *e) { + return 
xmlGetPredefinedEntity(e); + } + void IStreamQuadIterator::ImplXML::on_characters(void *th, xmlChar const *e, int const len) { + auto *t = static_cast(th); + t->handle_state_transition(t->current_state_->on_characters(t->output_, from_xml_char(e, len), t->make_info())); + } + void IStreamQuadIterator::ImplXML::on_start_element(void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri, + [[maybe_unused]] int n_namespaces, [[maybe_unused]] xmlChar const **namespaces, + int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) { + auto *t = static_cast(th); + t->handle_state_transition(t->current_state_->on_start_element(t->output_, from_xml_char(local_name), from_xml_char(uri), + std::span{reinterpret_cast(attributes), static_cast(n_attributes)}, t->make_info())); + } + void IStreamQuadIterator::ImplXML::on_end_element(void *th, [[maybe_unused]] xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, [[maybe_unused]] xmlChar const *uri) { + auto *t = static_cast(th); + t->handle_state_transition(t->current_state_->on_end_element(t->output_, t->make_info())); + } XMLStateInfo IStreamQuadIterator::ImplXML::make_info() const { std::string_view base = ""; - for (auto const &s : state_stack_ | std::ranges::views::reverse) { - std::string_view const v = s.get().base; + for (auto const &s : state_stack_ | std::views::reverse) { + std::string_view const v = s->base; if (!v.empty()) { base = v; break; @@ -108,8 +112,8 @@ namespace rdf4cpp::parser { } std::string_view lang_tag = ""; - for (auto const &s : state_stack_ | std::ranges::views::reverse) { - std::string_view const v = s.get().lang_tag; + for (auto const &s : state_stack_ | std::views::reverse) { + std::string_view const v = s->lang_tag; if (!v.empty()) { lang_tag = v; break; @@ -123,8 +127,8 @@ namespace rdf4cpp::parser { std::string_view const source{reinterpret_cast(data), static_cast(size)}; return XMLStateInfo{ - 
static_cast(xmlSAX2GetLineNumber(context_.get())), - static_cast(xmlSAX2GetColumnNumber(context_.get())), + current_line(), + current_column(), base, lang_tag, source, @@ -148,7 +152,7 @@ namespace rdf4cpp::parser { } std::optional IStreamQuadIterator::ImplXML::next() { - std::array buffer; // NOLINT(*-pro-type-member-init) + std::array buffer; // NOLINT(*-pro-type-member-init) while (output_.empty() && error_func_(reader_obj_) == 0 && eof_func_(reader_obj_) == 0) { auto const read = read_func_(buffer.data(), sizeof(char), buffer.size(), reader_obj_); xmlParseChunk(context_.get(), buffer.data(), static_cast(read), eof_func_(reader_obj_) != 0); diff --git a/private/rdf4cpp/parser/XMLParser.hpp b/private/rdf4cpp/parser/XMLParser.hpp index 2cd058d9..b1a8e537 100644 --- a/private/rdf4cpp/parser/XMLParser.hpp +++ b/private/rdf4cpp/parser/XMLParser.hpp @@ -68,6 +68,12 @@ namespace rdf4cpp::parser { void pop_state(); static void on_error(void *th, char const *msg, ...); + static xmlEntity *get_entity(void *th, xmlChar const *e); + static void on_characters(void *th, xmlChar const *e, int len); + static void on_start_element(void *th, xmlChar const *local_name, xmlChar const *prefix, xmlChar const *uri, + int n_namespaces, xmlChar const **namespaces, + int n_attributes, int n_defaulted, xmlChar const **attributes); + static void on_end_element(void *th, xmlChar const *local_name, xmlChar const *prefix, xmlChar const *uri); [[nodiscard]] XMLStateInfo make_info() const; @@ -85,21 +91,6 @@ namespace rdf4cpp::parser { [[nodiscard]] uint64_t current_line() const noexcept override; [[nodiscard]] uint64_t current_column() const noexcept override; }; - - struct StateTransition { - using ModifyStateStack = std::variant; - - ModifyStateStack modify_state; - - template - explicit StateTransition(T &&...a) : modify_state(std::forward(a)...) 
{ - } - - StateTransition() : StateTransition(std::in_place_type_t{}) { - } - }; } // namespace rdf4cpp::parser #endif //RDF4CPP_XMLPARSER_H diff --git a/private/rdf4cpp/parser/XMLParserStateTransition.hpp b/private/rdf4cpp/parser/XMLParserStateTransition.hpp new file mode 100644 index 00000000..1fddc20e --- /dev/null +++ b/private/rdf4cpp/parser/XMLParserStateTransition.hpp @@ -0,0 +1,39 @@ +#ifndef RDF4CPP_XMLPARSERSTATETRANSITION_H +#define RDF4CPP_XMLPARSERSTATETRANSITION_H + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +namespace rdf4cpp::parser { + struct StateTransition { + using ModifyStateStack = std::variant; + + ModifyStateStack modify_state; + + template + explicit StateTransition(T &&...a) : modify_state(std::forward(a)...) { + } + + StateTransition() : StateTransition(std::in_place_type_t{}) { + } + }; +} + +#endif //RDF4CPP_XMLPARSERSTATETRANSITION_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp index 141e6e6c..2e77a2a4 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp @@ -1,5 +1,7 @@ #include +#include + namespace rdf4cpp::parser::xml_states { StateTransition CollectionState::on_characters(XMLOutputQueue &out, std::string_view const chars, XMLStateInfo const &info) { if (!trim_left(chars).empty()) { diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp index 487848b0..3ea1eb93 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp @@ -6,6 +6,15 @@ namespace rdf4cpp::parser::xml_states { /** * state for https://www.w3.org/TR/rdf11-xml/#parseTypeCollectionPropertyElt + * + * example: + * + * + * ... + * ... + * ... 
+ * + * */ struct CollectionState final : BaseState { StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, XMLStateInfo const &info) override; diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp index 86100a88..fbc79a60 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp @@ -1,5 +1,7 @@ #include +#include + namespace rdf4cpp::parser::xml_states { StateTransition DescriptionState::on_characters(XMLOutputQueue &out, std::string_view const chars, XMLStateInfo const &info) { if (!trim_left(chars).empty()) { @@ -64,7 +66,8 @@ namespace rdf4cpp::parser::xml_states { return StateTransition(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify, *datatype); } else if (sub.has_value()) { out.add_statement(subject, predicate, *sub, reify); - return StateTransition(std::in_place_type_t{}); + return StateTransition(std::in_place_type_t{}); // predicate is expected to be empty, object defined as attribute + // example: https://www.w3.org/2013/RDFXMLTests/rdfms-empty-property-elements/test013.rdf } else if (parse_resource) { Node const obj = out.make_bn(std::nullopt, info); out.add_statement(subject, predicate, obj, reify); diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp index fbd48864..17318432 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp @@ -9,6 +9,11 @@ namespace rdf4cpp::parser::xml_states { * state for https://www.w3.org/TR/rdf11-xml/#nodeElementList and https://www.w3.org/TR/rdf11-xml/#nodeElement * on_start_element checks and dispatches for the different options in https://www.w3.org/TR/rdf11-xml/#propertyEltList * 
(https://www.w3.org/TR/rdf11-xml/#parseTypeResourcePropertyElt has no own state, instead gets handled directly by on_start_element) + * + * example: + * + * ... + * */ struct DescriptionState final : BaseState { StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, XMLStateInfo const &info) override; @@ -23,6 +28,15 @@ namespace rdf4cpp::parser::xml_states { : BaseState(i), subject(sub) { } + /** + * enters a description state + * @param out + * @param local_name + * @param uri + * @param attributes + * @param info + * @return transition & the node this state represents, to be used as object in parent states + */ static std::pair enter(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, XMLStateInfo const &info); static constexpr std::string_view start_element = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description"; diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp index f020a576..26a53783 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp @@ -1,5 +1,7 @@ #include +#include + namespace rdf4cpp::parser::xml_states { StateTransition EmptyElement::on_characters(XMLOutputQueue &out, std::string_view const chars, XMLStateInfo const &info) { if (!trim_left(chars).empty()) { diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp index d28418b7..4cfd62ce 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp @@ -6,6 +6,11 @@ namespace rdf4cpp::parser::xml_states { /** * state for https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (if attributes are present) + * + * example: + * + * + * */ struct EmptyElement final : BaseState { StateTransition on_characters(XMLOutputQueue &out, 
std::string_view chars, XMLStateInfo const &info) override; diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp index 5f380ba8..30eab4ce 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp @@ -1,5 +1,7 @@ #include +#include + namespace rdf4cpp::parser::xml_states { StateTransition InitialState::on_characters(XMLOutputQueue &out, std::string_view const chars, XMLStateInfo const &info) { if (!trim_left(chars).empty()) { diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp index dec8cdc6..ad7a5dc8 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp @@ -1,5 +1,7 @@ #include +#include + namespace rdf4cpp::parser::xml_states { StateTransition PredicateState::on_characters([[maybe_unused]] XMLOutputQueue &out, std::string_view const chars, XMLStateInfo const &info) { diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp index c6553683..a600d1b7 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp @@ -9,6 +9,17 @@ namespace rdf4cpp::parser::xml_states { * state for https://www.w3.org/TR/rdf11-xml/#resourcePropertyElt (nested nodeElement / DescriptionState * and https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (literal with no datatype attribute) * and https://www.w3.org/TR/rdf11-xml/#emptyPropertyElt (with no attributes (empty literal)) + * + * example: + * + * + * + * ... 
+ * + * + * foo + * + * */ struct PredicateState : BaseState { StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, XMLStateInfo const &info) override; diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp index 9bf6a6cf..b44f5691 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp @@ -1,5 +1,7 @@ #include +#include + namespace rdf4cpp::parser::xml_states { StateTransition RDFState::on_characters(XMLOutputQueue &out, std::string_view const chars, XMLStateInfo const &info) { if (!trim_left(chars).empty()) { diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp index 2730b109..4c69e2ae 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp @@ -7,6 +7,11 @@ namespace rdf4cpp::parser::xml_states { /** * state for https://www.w3.org/TR/rdf11-xml/#RDF + * + * example: + * + * ... 
+ * */ struct RDFState final : BaseState { StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, XMLStateInfo const &info) override; diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp index 234900f3..62fb99dc 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp @@ -1,5 +1,7 @@ #include +#include + namespace rdf4cpp::parser::xml_states { StateTransition TypedLiteralPredicateState::on_start_element(XMLOutputQueue &out, [[maybe_unused]] std::string_view local_name, [[maybe_unused]] std::string_view uri, [[maybe_unused]] std::span attributes, XMLStateInfo const &info) { out.add_error(ParsingError::Type::BadSyntax, "expected literal, found element", info); diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp index 787ecf75..9ab29787 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp @@ -7,6 +7,11 @@ namespace rdf4cpp::parser::xml_states { /** * state for https://www.w3.org/TR/rdf11-xml/#literalPropertyElt (with datatype attribute) + * + * example: + * + * 10 + * */ struct TypedLiteralPredicateState final : PredicateState { StateTransition on_start_element(XMLOutputQueue &out, std::string_view local_name, std::string_view uri, std::span attributes, XMLStateInfo const &info) override; diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp index 78bdd5d0..871209e9 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp @@ -1,5 +1,7 @@ #include 
+#include + namespace rdf4cpp::parser::xml_states { StateTransition XMLLiteralState::on_characters([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] std::string_view chars, XMLStateInfo const &info) { source_input(info); diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp index 3405cf93..7ffbdfff 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp @@ -8,6 +8,11 @@ namespace rdf4cpp::parser::xml_states { /** * state for https://www.w3.org/TR/rdf11-xml/#parseTypeLiteralPropertyElt * (and https://www.w3.org/TR/rdf11-xml/#parseTypeOtherPropertyElt) + * + * example: + * + * + * */ struct XMLLiteralState final : PredicateState { StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, XMLStateInfo const &info) override; From e23492e4cacbdb9666e93e4a514809851fa33317 Mon Sep 17 00:00:00 2001 From: mcb Date: Fri, 9 Jan 2026 14:11:04 +0100 Subject: [PATCH 38/42] fix merge --- private/rdf4cpp/parser/XMLParser.cpp | 6 +++--- private/rdf4cpp/parser/XMLParserStateTransition.hpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/private/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp index 2084dbfb..d55d3522 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -74,18 +74,18 @@ namespace rdf4cpp::parser { } void IStreamQuadIterator::ImplXML::on_characters(void *th, xmlChar const *e, int const len) { auto *t = static_cast(th); - t->handle_state_transition(t->current_state_->on_characters(t->output_, from_xml_char(e, len), t->make_info())); + t->handle_state_transition(t->current_state().on_characters(t->output_, from_xml_char(e, len), t->make_info())); } void IStreamQuadIterator::ImplXML::on_start_element(void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri, 
[[maybe_unused]] int n_namespaces, [[maybe_unused]] xmlChar const **namespaces, int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) { auto *t = static_cast(th); - t->handle_state_transition(t->current_state_->on_start_element(t->output_, from_xml_char(local_name), from_xml_char(uri), + t->handle_state_transition(t->current_state().on_start_element(t->output_, from_xml_char(local_name), from_xml_char(uri), std::span{reinterpret_cast(attributes), static_cast(n_attributes)}, t->make_info())); } void IStreamQuadIterator::ImplXML::on_end_element(void *th, [[maybe_unused]] xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, [[maybe_unused]] xmlChar const *uri) { auto *t = static_cast(th); - t->handle_state_transition(t->current_state_->on_end_element(t->output_, t->make_info())); + t->handle_state_transition(t->current_state().on_end_element(t->output_, t->make_info())); } XMLStateInfo IStreamQuadIterator::ImplXML::make_info() const { diff --git a/private/rdf4cpp/parser/XMLParserStateTransition.hpp b/private/rdf4cpp/parser/XMLParserStateTransition.hpp index 1fddc20e..11fecca4 100644 --- a/private/rdf4cpp/parser/XMLParserStateTransition.hpp +++ b/private/rdf4cpp/parser/XMLParserStateTransition.hpp @@ -27,11 +27,11 @@ namespace rdf4cpp::parser { ModifyStateStack modify_state; - template - explicit StateTransition(T &&...a) : modify_state(std::forward(a)...) { + template + explicit StateTransition(Args &&...args) : modify_state(std::forward(args)...) 
{ } - StateTransition() : StateTransition(std::in_place_type_t{}) { + StateTransition() noexcept : StateTransition(std::in_place_type_t{}) { } }; } From 71638eaa924b8fd10d27e848d38173d54fc5c32d Mon Sep 17 00:00:00 2001 From: Nikolaos Karalis Date: Wed, 21 Jan 2026 10:25:37 +0100 Subject: [PATCH 39/42] tests from rdf-tests repo --- tests/CMakeLists.txt | 2 + tests/parser/tests_XMLParser.cpp | 239 +++++++++++++++++++++++++------ 2 files changed, 195 insertions(+), 46 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index bc5186ca..2c38a3cd 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,6 +2,7 @@ set(CMAKE_CXX_STANDARD 20) find_package(doctest REQUIRED) find_package(nanobench REQUIRED) +find_package(CURL REQUIRED) # add the executable for all tests add_executable(tests_Variable query/tests_Variable.cpp) @@ -364,6 +365,7 @@ add_executable(tests_XMLParser parser/tests_XMLParser.cpp) target_link_libraries(tests_XMLParser doctest::doctest rdf4cpp + CURL::libcurl ) add_test(NAME tests_XMLParser COMMAND tests_XMLParser) diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index 3e202eec..15dcf204 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -1,10 +1,10 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include - #include #include - +#include +#include using namespace rdf4cpp; using namespace rdf4cpp::parser; @@ -989,51 +989,198 @@ void xml_test_negative(std::string xml_str, std::string_view base_iri) { CHECK(had_error == true); } -TEST_CASE("test xml tests") { - // TODO replace with tests by Nikos - // basic functionality - xml_test_positive(R"( - - - -xxx - - -)", R"( "xxx" .)", "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/amp-in-url/test001.rdf"); - - xml_test_negative(R"(" - " - - -)", "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error001.rdf"); +// adopted from 
https://stackoverflow.com/questions/9786150/save-curl-content-result-into-a-string-in-c/9786295#9786295 +static size_t write_callback(void const *contents, size_t size, size_t nmemb, void *userp) { + static_cast(userp)->append(static_cast(contents), size * nmemb); + return size * nmemb; +} - // check reordering & base - xml_test_positive(R"( - +std::string remote_test_file_to_str(std::string const &file_name) { + CURL *curl = nullptr; + CURLcode curl_res; + auto const url = std::format("https://raw.githubusercontent.com/w3c/rdf-tests/refs/heads/main/rdf/rdf11/rdf-xml/{}", file_name); + std::string file_contents_as_str; + curl = curl_easy_init(); + if(curl) { + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &file_contents_as_str); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L); // for https + curl_res = curl_easy_perform(curl); + curl_easy_cleanup(curl); + } + REQUIRE_EQ(curl_res, CURLE_OK); + return file_contents_as_str; +} - - 1 - 2 - - - - - -)", R"(_:bar . -_:bar "1" . - . - _:bar . - . - "1" . -_:bar "2"^^ . -_:bar _:res . -_:res . -_:bar _:res2 . - . - _:bar . - . - _:res2 . 
-_:res2 "foobar" .)", "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/test004.rdf"); +TEST_CASE("test cases from rdf-tests") { + // positive tests + xml_test_positive(remote_test_file_to_str("amp-in-url/test001.rdf"), remote_test_file_to_str("amp-in-url/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/amp-in-url/test001.rdf"); + xml_test_positive(remote_test_file_to_str("datatypes/test001.rdf"), remote_test_file_to_str("datatypes/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/datatypes/test001.rdf"); + // xml_test_positive(remote_test_file_to_str("datatypes/test002.rdf"), remote_test_file_to_str("datatypes/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/datatypes/test002.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-charmod-literals/test001.rdf"), remote_test_file_to_str("rdf-charmod-literals/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-charmod-literals/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-charmod-uris/test001.rdf"), remote_test_file_to_str("rdf-charmod-uris/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-charmod-uris/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-charmod-uris/test002.rdf"), remote_test_file_to_str("rdf-charmod-uris/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-charmod-uris/test002.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-containers-syntax-vs-schema/test001.rdf"), remote_test_file_to_str("rdf-containers-syntax-vs-schema/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-containers-syntax-vs-schema/test002.rdf"), remote_test_file_to_str("rdf-containers-syntax-vs-schema/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/test002.rdf"); + 
xml_test_positive(remote_test_file_to_str("rdf-containers-syntax-vs-schema/test003.rdf"), remote_test_file_to_str("rdf-containers-syntax-vs-schema/test003.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/test003.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-containers-syntax-vs-schema/test004.rdf"), remote_test_file_to_str("rdf-containers-syntax-vs-schema/test004.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/test004.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-containers-syntax-vs-schema/test006.rdf"), remote_test_file_to_str("rdf-containers-syntax-vs-schema/test006.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/test006.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-containers-syntax-vs-schema/test007.rdf"), remote_test_file_to_str("rdf-containers-syntax-vs-schema/test007.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/test007.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-containers-syntax-vs-schema/test008.rdf"), remote_test_file_to_str("rdf-containers-syntax-vs-schema/test008.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/test008.rdf"); + // xml_test_positive(remote_test_file_to_str("rdf-element-not-mandatory/test001.rdf"), remote_test_file_to_str("rdf-element-not-mandatory/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-element-not-mandatory/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-node-element/test001.rdf"), remote_test_file_to_str("rdf-node-element/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-node-element/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-ns-prefix-confusion/test0001.rdf"), remote_test_file_to_str("rdf-ns-prefix-confusion/test0001.nt"), 
"https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-ns-prefix-confusion/test0001.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-ns-prefix-confusion/test0003.rdf"), remote_test_file_to_str("rdf-ns-prefix-confusion/test0003.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-ns-prefix-confusion/test0003.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-ns-prefix-confusion/test0004.rdf"), remote_test_file_to_str("rdf-ns-prefix-confusion/test0004.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-ns-prefix-confusion/test0004.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-ns-prefix-confusion/test0005.rdf"), remote_test_file_to_str("rdf-ns-prefix-confusion/test0005.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-ns-prefix-confusion/test0005.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-ns-prefix-confusion/test0006.rdf"), remote_test_file_to_str("rdf-ns-prefix-confusion/test0006.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-ns-prefix-confusion/test0006.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-ns-prefix-confusion/test0009.rdf"), remote_test_file_to_str("rdf-ns-prefix-confusion/test0009.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-ns-prefix-confusion/test0009.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-ns-prefix-confusion/test0010.rdf"), remote_test_file_to_str("rdf-ns-prefix-confusion/test0010.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-ns-prefix-confusion/test0010.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-ns-prefix-confusion/test0011.rdf"), remote_test_file_to_str("rdf-ns-prefix-confusion/test0011.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-ns-prefix-confusion/test0011.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-ns-prefix-confusion/test0012.rdf"), remote_test_file_to_str("rdf-ns-prefix-confusion/test0012.nt"), 
"https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-ns-prefix-confusion/test0012.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-ns-prefix-confusion/test0013.rdf"), remote_test_file_to_str("rdf-ns-prefix-confusion/test0013.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-ns-prefix-confusion/test0013.rdf"); + xml_test_positive(remote_test_file_to_str("rdf-ns-prefix-confusion/test0014.rdf"), remote_test_file_to_str("rdf-ns-prefix-confusion/test0014.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-ns-prefix-confusion/test0014.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-difference-between-ID-and-about/test1.rdf"), remote_test_file_to_str("rdfms-difference-between-ID-and-about/test1.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-difference-between-ID-and-about/test1.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-difference-between-ID-and-about/test2.rdf"), remote_test_file_to_str("rdfms-difference-between-ID-and-about/test2.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-difference-between-ID-and-about/test2.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-difference-between-ID-and-about/test3.rdf"), remote_test_file_to_str("rdfms-difference-between-ID-and-about/test3.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-difference-between-ID-and-about/test3.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-duplicate-member-props/test001.rdf"), remote_test_file_to_str("rdfms-duplicate-member-props/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-duplicate-member-props/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test001.rdf"), remote_test_file_to_str("rdfms-empty-property-elements/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test002.rdf"), 
remote_test_file_to_str("rdfms-empty-property-elements/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test002.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test004.rdf"), remote_test_file_to_str("rdfms-empty-property-elements/test004.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test004.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test005.rdf"), remote_test_file_to_str("rdfms-empty-property-elements/test005.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test005.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test006.rdf"), remote_test_file_to_str("rdfms-empty-property-elements/test006.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test006.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test007.rdf"), remote_test_file_to_str("rdfms-empty-property-elements/test007.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test007.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test008.rdf"), remote_test_file_to_str("rdfms-empty-property-elements/test008.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test008.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test010.rdf"), remote_test_file_to_str("rdfms-empty-property-elements/test010.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test010.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test011.rdf"), remote_test_file_to_str("rdfms-empty-property-elements/test011.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test011.rdf"); + 
xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test012.rdf"), remote_test_file_to_str("rdfms-empty-property-elements/test012.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test012.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test013.rdf"), remote_test_file_to_str("rdfms-empty-property-elements/test013.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test013.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test014.rdf"), remote_test_file_to_str("rdfms-empty-property-elements/test014.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test014.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test015.rdf"), remote_test_file_to_str("rdfms-empty-property-elements/test015.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test015.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test016.rdf"), remote_test_file_to_str("rdfms-empty-property-elements/test016.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test016.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-empty-property-elements/test017.rdf"), remote_test_file_to_str("rdfms-empty-property-elements/test017.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/test017.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-identity-anon-resources/test001.rdf"), remote_test_file_to_str("rdfms-identity-anon-resources/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-identity-anon-resources/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-identity-anon-resources/test002.rdf"), remote_test_file_to_str("rdfms-identity-anon-resources/test002.nt"), 
"https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-identity-anon-resources/test002.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-identity-anon-resources/test003.rdf"), remote_test_file_to_str("rdfms-identity-anon-resources/test003.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-identity-anon-resources/test003.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-identity-anon-resources/test004.rdf"), remote_test_file_to_str("rdfms-identity-anon-resources/test004.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-identity-anon-resources/test004.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-identity-anon-resources/test005.rdf"), remote_test_file_to_str("rdfms-identity-anon-resources/test005.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-identity-anon-resources/test005.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-not-id-and-resource-attr/test001.rdf"), remote_test_file_to_str("rdfms-not-id-and-resource-attr/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-not-id-and-resource-attr/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-not-id-and-resource-attr/test002.rdf"), remote_test_file_to_str("rdfms-not-id-and-resource-attr/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-not-id-and-resource-attr/test002.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-not-id-and-resource-attr/test004.rdf"), remote_test_file_to_str("rdfms-not-id-and-resource-attr/test004.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-not-id-and-resource-attr/test004.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-not-id-and-resource-attr/test005.rdf"), remote_test_file_to_str("rdfms-not-id-and-resource-attr/test005.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-not-id-and-resource-attr/test005.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-para196/test001.rdf"), 
remote_test_file_to_str("rdfms-para196/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-para196/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-001.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-001.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-002.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-002.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-003.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-003.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-003.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-004.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-004.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-004.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-005.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-005.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-005.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-006.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-006.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-006.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-007.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-007.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-007.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-008.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-008.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-008.rdf"); + 
xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-009.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-009.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-009.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-010.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-010.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-010.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-011.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-011.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-011.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-012.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-012.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-012.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-013.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-013.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-013.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-014.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-014.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-014.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-015.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-015.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-015.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-016.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-016.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-016.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-017.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-017.nt"), 
"https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-017.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-018.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-018.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-018.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-019.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-019.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-019.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-020.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-020.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-020.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-021.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-021.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-021.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-022.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-022.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-022.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-023.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-023.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-023.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-024.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-024.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-024.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-025.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-025.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-025.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-026.rdf"), 
remote_test_file_to_str("rdfms-rdf-names-use/test-026.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-026.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-027.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-027.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-027.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-028.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-028.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-028.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-029.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-029.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-029.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-030.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-030.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-030.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-031.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-031.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-031.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-032.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-032.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-032.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-033.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-033.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-033.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-034.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-034.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-034.rdf"); + 
xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-035.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-035.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-035.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-036.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-036.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-036.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/test-037.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/test-037.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/test-037.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/warn-001.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/warn-001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/warn-001.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/warn-002.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/warn-002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/warn-002.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-rdf-names-use/warn-003.rdf"), remote_test_file_to_str("rdfms-rdf-names-use/warn-003.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/warn-003.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-reification-required/test001.rdf"), remote_test_file_to_str("rdfms-reification-required/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-reification-required/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-reification-required/test002.rdf"), remote_test_file_to_str("rdfms-reification-required/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-reification-required/test002.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-seq-representation/test001.rdf"), 
remote_test_file_to_str("rdfms-seq-representation/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-seq-representation/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-seq-representation/test002.rdf"), remote_test_file_to_str("rdfms-seq-representation/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-seq-representation/test002.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-syntax-incomplete/test001.rdf"), remote_test_file_to_str("rdfms-syntax-incomplete/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-syntax-incomplete/test002.rdf"), remote_test_file_to_str("rdfms-syntax-incomplete/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/test002.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-syntax-incomplete/test003.rdf"), remote_test_file_to_str("rdfms-syntax-incomplete/test003.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/test003.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-syntax-incomplete/test004.rdf"), remote_test_file_to_str("rdfms-syntax-incomplete/test004.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/test004.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-uri-substructure/test001.rdf"), remote_test_file_to_str("rdfms-uri-substructure/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-uri-substructure/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-xmllang/test003.rdf"), remote_test_file_to_str("rdfms-xmllang/test003.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-xmllang/test003.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-xmllang/test004.rdf"), remote_test_file_to_str("rdfms-xmllang/test004.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-xmllang/test004.rdf"); + 
xml_test_positive(remote_test_file_to_str("rdfms-xmllang/test005.rdf"), remote_test_file_to_str("rdfms-xmllang/test005.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-xmllang/test005.rdf"); + xml_test_positive(remote_test_file_to_str("rdfms-xmllang/test006.rdf"), remote_test_file_to_str("rdfms-xmllang/test006.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-xmllang/test006.rdf"); + xml_test_positive(remote_test_file_to_str("rdfs-domain-and-range/test001.rdf"), remote_test_file_to_str("rdfs-domain-and-range/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfs-domain-and-range/test001.rdf"); + xml_test_positive(remote_test_file_to_str("rdfs-domain-and-range/test002.rdf"), remote_test_file_to_str("rdfs-domain-and-range/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfs-domain-and-range/test002.rdf"); + // xml_test_positive(remote_test_file_to_str("unrecognised-xml-attributes/test001.rdf"), remote_test_file_to_str("unrecognised-xml-attributes/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/unrecognised-xml-attributes/test001.rdf"); + // xml_test_positive(remote_test_file_to_str("unrecognised-xml-attributes/test002.rdf"), remote_test_file_to_str("unrecognised-xml-attributes/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/unrecognised-xml-attributes/test002.rdf"); + // xml_test_positive(remote_test_file_to_str("xml-canon/test001.rdf"), remote_test_file_to_str("xml-canon/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xml-canon/test001.rdf"); + // xml_test_positive(remote_test_file_to_str("xml-canon/test002.rdf"), remote_test_file_to_str("xml-canon/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xml-canon/test002.rdf"); + xml_test_positive(remote_test_file_to_str("xmlbase/test001.rdf"), remote_test_file_to_str("xmlbase/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test001.rdf"); + 
xml_test_positive(remote_test_file_to_str("xmlbase/test002.rdf"), remote_test_file_to_str("xmlbase/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test002.rdf"); + xml_test_positive(remote_test_file_to_str("xmlbase/test003.rdf"), remote_test_file_to_str("xmlbase/test003.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test003.rdf"); + xml_test_positive(remote_test_file_to_str("xmlbase/test004.rdf"), remote_test_file_to_str("xmlbase/test004.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test004.rdf"); + xml_test_positive(remote_test_file_to_str("xmlbase/test006.rdf"), remote_test_file_to_str("xmlbase/test006.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test006.rdf"); + xml_test_positive(remote_test_file_to_str("xmlbase/test007.rdf"), remote_test_file_to_str("xmlbase/test007.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test007.rdf"); + xml_test_positive(remote_test_file_to_str("xmlbase/test008.rdf"), remote_test_file_to_str("xmlbase/test008.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test008.rdf"); + xml_test_positive(remote_test_file_to_str("xmlbase/test009.rdf"), remote_test_file_to_str("xmlbase/test009.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test009.rdf"); + xml_test_positive(remote_test_file_to_str("xmlbase/test010.rdf"), remote_test_file_to_str("xmlbase/test010.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test010.rdf"); + xml_test_positive(remote_test_file_to_str("xmlbase/test011.rdf"), remote_test_file_to_str("xmlbase/test011.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test011.rdf"); + xml_test_positive(remote_test_file_to_str("xmlbase/test013.rdf"), remote_test_file_to_str("xmlbase/test013.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test013.rdf"); + xml_test_positive(remote_test_file_to_str("xmlbase/test014.rdf"), 
remote_test_file_to_str("xmlbase/test014.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test014.rdf"); + // negative tests + // xml_test_negative(remote_test_file_to_str("rdf-containers-syntax-vs-schema/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/error001.rdf"); + // xml_test_negative(remote_test_file_to_str("rdf-containers-syntax-vs-schema/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/error002.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-abouteach/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-abouteach/error001.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-abouteach/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-abouteach/error002.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-difference-between-ID-and-about/error1.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-difference-between-ID-and-about/error1.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-empty-property-elements/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/error001.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-empty-property-elements/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/error002.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error001.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error002.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error003.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error003.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error004.rdf"), 
"https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error004.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error005.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error005.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error006.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error006.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error007.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error007.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-001.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-002.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-003.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-003.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-004.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-004.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-005.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-005.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-006.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-006.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-007.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-007.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-008.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-008.rdf"); + // 
xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-009.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-009.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-010.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-010.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-011.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-011.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-012.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-012.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-013.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-013.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-014.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-014.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-015.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-015.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-016.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-016.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-017.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-017.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-018.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-018.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-019.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-019.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-020.rdf"), 
"https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-020.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error001.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error002.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error003.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error003.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error004.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error004.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error005.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error005.rdf"); + // xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error006.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error006.rdf"); } From 95d11d689dd17736c3c06282c04684ec3ceda375 Mon Sep 17 00:00:00 2001 From: Nikolaos Karalis Date: Wed, 21 Jan 2026 10:28:35 +0100 Subject: [PATCH 40/42] added missing test dep --- conanfile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/conanfile.py b/conanfile.py index a4891bb4..3376d451 100644 --- a/conanfile.py +++ b/conanfile.py @@ -44,6 +44,7 @@ def requirements(self): if self.options.with_test_deps: self.test_requires("doctest/2.4.11") self.test_requires("nanobench/4.3.11") + self.test_requires("libcurl/8.12.1") def set_name(self): if not hasattr(self, 'name') or self.version is None: From 4c26db91334dff9dedbcbfe7cc755d8b05a17aec Mon Sep 17 00:00:00 2001 From: mcb Date: Wed, 21 Jan 2026 16:27:12 +0100 Subject: [PATCH 41/42] tests --- private/rdf4cpp/parser/XMLParser.cpp | 23 + 
private/rdf4cpp/parser/XMLParserUtility.cpp | 48 +- private/rdf4cpp/parser/XMLParserUtility.hpp | 4 + .../XMLStates/XMLParserDescriptionState.cpp | 62 +- .../XMLStates/XMLParserInitialState.cpp | 8 +- src/rdf4cpp/util/CharMatcher.hpp | 55 ++ tests/parser/tests_XMLParser.cpp | 847 ++---------------- 7 files changed, 245 insertions(+), 802 deletions(-) diff --git a/private/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp index d55d3522..40f70431 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -53,6 +53,29 @@ namespace rdf4cpp::parser { return iri_equal_pieces(e, uri, local_name); }); } + bool iri_core_syntax(std::string_view const uri, std::string_view const local_name) { + static constexpr std::array reserved = { + xml_states::RDFState::start_element, + xml_states::DescriptionState::id_attrib, + xml_states::DescriptionState::about_attrib, + xml_states::PredicateState::parse_type_attrib, + xml_states::PredicateState::resource_attrib, + xml_states::DescriptionState::node_id_attrib, + }; + return std::ranges::any_of(reserved, [&](std::string_view const e) { + return iri_equal_pieces(e, uri, local_name); + }); + } + bool iri_old_term(std::string_view const uri, std::string_view const local_name) { + static constexpr std::array reserved = { + std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"}, + std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"}, + std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"}, + }; + return std::ranges::any_of(reserved, [&](std::string_view const e) { + return iri_equal_pieces(e, uri, local_name); + }); + } void IStreamQuadIterator::ImplXML::on_error(void *th, char const *msg, ...) 
{ // NOLINT(*-dcl50-cpp) va_list args; diff --git a/private/rdf4cpp/parser/XMLParserUtility.cpp b/private/rdf4cpp/parser/XMLParserUtility.cpp index ddda1683..92c654f7 100644 --- a/private/rdf4cpp/parser/XMLParserUtility.cpp +++ b/private/rdf4cpp/parser/XMLParserUtility.cpp @@ -1,5 +1,6 @@ #include - +#include +#include namespace rdf4cpp::parser { XMLOutputQueue::XMLOutputQueue(state_type *state) : state_(state) { @@ -35,6 +36,9 @@ namespace rdf4cpp::parser { void XMLOutputQueue::add_error(ParsingError::Type ty, std::string msg, XMLStateInfo const &i) { result_queue_.emplace_back(nonstd::unexpect, ty, i.line, i.column, std::move(msg)); } + void XMLOutputQueue::add_old_term_error(XMLStateInfo const &i) { + add_error(ParsingError::Type::BadSyntax, "rdf:bagID, rdf:aboutEach and rdf:aboutEachPrefix were removed", i); + } void XMLOutputQueue::add_statement(Node subject, IRI predicate, Node object, IRI reify) { if (subject.null() || predicate.null() || object.null()) { @@ -91,7 +95,27 @@ namespace rdf4cpp::parser { return make_iri(iri, base, i); } + bool is_ncname(std::string_view v) { + using namespace util::char_matcher_detail; + + if (v.empty()) { + return false; + } + if (!match(v)) { + return false; + } + auto r = v | una::views::utf8; + if (r.begin() == r.end()) { + return false; + } + return xml::NCNameStartChar.match(static_cast(*r.begin())); + } + IRI XMLOutputQueue::make_id(std::string_view const local_name, std::string_view const base, XMLStateInfo const &i) { + if (!is_ncname(local_name)) { + add_error(ParsingError::Type::BadIri, std::format("{}: is not a valid NCName (required for rdf:ID)", local_name), i); + return IRI::make_null(); + } std::string local = "#"; local.append(local_name); auto iri = make_iri(local, base, i); @@ -106,9 +130,13 @@ namespace rdf4cpp::parser { Node XMLOutputQueue::make_bn(std::optional name, XMLStateInfo const &i) { std::string n = ""; if (!name.has_value()) { - n = std::format("bn_{}", next_bn_index_++); + n = 
std::format("{}_bn", next_bn_index_++); name = n; } + else if (!is_ncname(*name)) { + add_error(ParsingError::Type::BadIri, std::format("{}: is not a valid NCName (required for rdf:nodeID)", *name), i); + return IRI::make_null(); + } try { if (state_->blank_node_scope_manager == nullptr) { return inspect_node(BlankNode::make(*name, state_->node_storage), i); @@ -163,5 +191,21 @@ namespace rdf4cpp::parser { } return full_iri.starts_with(uri) && full_iri.ends_with(local_name); } + bool iri_in_xml_namespace(std::string_view uri, std::string_view local_name) { + static constexpr std::string_view xml_namespace = "http://www.w3.org/XML/1998/namespace"; + if (uri.length() + local_name.length() >= xml_namespace.length()) { + if (uri != xml_namespace.substr(0, uri.length())) { + return false; + } + if (uri.length() < xml_namespace.length() && !local_name.starts_with(xml_namespace.substr(uri.length()))) { + return false; + } + return true; + } + if (uri.empty() && local_name.starts_with("xml")) { + return true; + } + return false; + } } // namespace rdf4cpp::parser \ No newline at end of file diff --git a/private/rdf4cpp/parser/XMLParserUtility.hpp b/private/rdf4cpp/parser/XMLParserUtility.hpp index 15e99680..cfe949ab 100644 --- a/private/rdf4cpp/parser/XMLParserUtility.hpp +++ b/private/rdf4cpp/parser/XMLParserUtility.hpp @@ -103,6 +103,7 @@ namespace rdf4cpp::parser { [[nodiscard]] std::string_view current_base_iri() const; void add_error(ParsingError::Type ty, std::string msg, XMLStateInfo const &i); + void add_old_term_error(XMLStateInfo const &i); /** * add statement to the output list, if none of the components is null * (null is used to track an already inserted parse error for that component) @@ -142,6 +143,9 @@ namespace rdf4cpp::parser { [[nodiscard]] std::string_view trim_left(std::string_view v); [[nodiscard]] bool iri_equal_pieces(std::string_view full_iri, std::string_view uri, std::string_view local_name); [[nodiscard]] bool iri_reserved(std::string_view uri, 
std::string_view local_name); + [[nodiscard]] bool iri_core_syntax(std::string_view uri, std::string_view local_name); + [[nodiscard]] bool iri_old_term(std::string_view uri, std::string_view local_name); + [[nodiscard]] bool iri_in_xml_namespace(std::string_view uri, std::string_view local_name); } // namespace rdf4cpp::parser #endif //RDF4CPP_XMLPARSERSTATECOLLECTOR_H diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp index fbc79a60..0a368ccd 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp @@ -11,6 +11,19 @@ namespace rdf4cpp::parser::xml_states { } StateTransition DescriptionState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span attributes, XMLStateInfo const &info) { + if (iri_core_syntax(uri, local_name)) { + out.add_error(ParsingError::Type::BadSyntax, "core syntax terms are not allowed as predicates", info); + return StateTransition(std::in_place_type_t{}); + } + if (iri_equal_pieces(start_element, uri, local_name)) { + out.add_error(ParsingError::Type::BadSyntax, "rdf:Description is not allowed as predicate", info); + return StateTransition(std::in_place_type_t{}); + } + if (iri_old_term(uri, local_name)) { + out.add_old_term_error(info); + return StateTransition(std::in_place_type_t{}); + } + auto const inherited_attribute_info = get_inherited_attributes(out, attributes, info); IRI predicate; if (iri_equal_pieces(PredicateState::list_start_element, uri, local_name)) { @@ -20,6 +33,13 @@ namespace rdf4cpp::parser::xml_states { } std::optional datatype = std::nullopt; std::optional sub = std::nullopt; + auto check_only_one = [&sub, &out, &info]() { + if (sub.has_value()) { + out.add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:ID, rdf:about, and rdf:nodeID", info); + return true; + } + return 
false; + }; IRI reify = IRI::make_null(); bool parse_resource = false; bool parse_literal = false; @@ -28,8 +48,10 @@ namespace rdf4cpp::parser::xml_states { if (iri_equal_pieces(TypedLiteralPredicateState::datatype_attrib, att.uri(), att.local_name())) { datatype = out.make_iri(att.value(), inherited_attribute_info.base, info); } else if (iri_equal_pieces(PredicateState::resource_attrib, att.uri(), att.local_name())) { + check_only_one(); sub = out.make_iri(att.value(), inherited_attribute_info.base, info); } else if (iri_equal_pieces(node_id_attrib, att.uri(), att.local_name())) { + check_only_one(); sub = out.make_bn(att.value(), info); } else if (iri_equal_pieces(id_attrib, att.uri(), att.local_name())) { reify = out.make_id(att.value(), inherited_attribute_info.base, info); @@ -44,9 +66,21 @@ namespace rdf4cpp::parser::xml_states { } } for (auto const &att : attributes) { + if (iri_equal_pieces(PredicateState::list_start_element, att.uri(), att.local_name())) { + out.add_error(ParsingError::Type::BadSyntax, "rdf:li is not allowed as attribute", info); + continue; + } + if (iri_old_term(att.uri(), att.local_name())) { + out.add_old_term_error(info); + continue; + } if (PredicateState::iri_reserved_predicate(att.uri(), att.local_name())) { continue; } + // the only reference i found to this is: https://github.com/w3c/rdf-tests/blob/main/rdf/rdf11/rdf-xml/unrecognised-xml-attributes/test001.rdf + if (iri_in_xml_namespace(att.uri(), att.local_name())) { + continue; + } if (!sub.has_value()) { sub = out.make_bn(std::nullopt, info); } @@ -119,15 +153,37 @@ namespace rdf4cpp::parser::xml_states { sub = out.make_bn(std::nullopt, info); } if (!iri_equal_pieces(start_element, uri, local_name)) { - IRI const obj = out.make_iri(uri, local_name, inherited_attribute_info.base, info); - if (!obj.null()) { - out.add_statement(sub, out.make_type_iri(), obj, IRI::make_null()); + if (iri_equal_pieces(PredicateState::list_start_element, uri, local_name)) { + 
out.add_error(ParsingError::Type::BadSyntax, "rdf:li is not allowed as element type", info); + } + else if (iri_core_syntax(uri, local_name)) { + out.add_error(ParsingError::Type::BadSyntax, "core syntax terms are not allowed as element type", info); + } + else if (iri_old_term( uri, local_name)) { + out.add_old_term_error(info); + } + else { + IRI const obj = out.make_iri(uri, local_name, inherited_attribute_info.base, info); + if (!obj.null()) { + out.add_statement(sub, out.make_type_iri(), obj, IRI::make_null()); + } } } for (auto const &att : attributes) { + if (iri_equal_pieces(PredicateState::list_start_element, att.uri(), att.local_name())) { + out.add_error(ParsingError::Type::BadSyntax, "rdf:li is not allowed as attribute", info); + continue; + } + if (iri_old_term(att.uri(), att.local_name())) { + out.add_old_term_error(info); + continue; + } if (PredicateState::iri_reserved_predicate(att.uri(), att.local_name())) { continue; } + if (iri_in_xml_namespace(att.uri(), att.local_name())) { + continue; + } if (iri_equal_pieces(type_attrib, att.uri(), att.local_name())) { IRI const obj = out.make_iri(att.value(), inherited_attribute_info.base, info); out.add_statement(sub, out.make_type_iri(), obj, IRI::make_null()); diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp index 30eab4ce..56c24f3e 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp @@ -5,7 +5,7 @@ namespace rdf4cpp::parser::xml_states { StateTransition InitialState::on_characters(XMLOutputQueue &out, std::string_view const chars, XMLStateInfo const &info) { if (!trim_left(chars).empty()) { - out.add_error(ParsingError::Type::BadSyntax, "expected RDF, found characters", info); + out.add_error(ParsingError::Type::BadSyntax, "expected RDF or Description, found characters", info); } return {}; } @@ -17,12 +17,12 @@ namespace 
rdf4cpp::parser::xml_states { get_inherited_attributes(out, attributes, info), }; } - out.add_error(ParsingError::Type::BadSyntax, "expected RDF, found ???", info); - return {}; + auto [trans, _] = DescriptionState::enter(out, local_name, uri, attributes, info); + return trans; } StateTransition InitialState::on_end_element(XMLOutputQueue &out, XMLStateInfo const &info) { - out.add_error(ParsingError::Type::BadSyntax, "expected RDF, found end of ???", info); + out.add_error(ParsingError::Type::BadSyntax, "expected RDF or Description, found end of initial state?", info); return {}; } void InitialState::move_to(BaseState *b) noexcept { diff --git a/src/rdf4cpp/util/CharMatcher.hpp b/src/rdf4cpp/util/CharMatcher.hpp index 2ce19c93..d178bf3a 100644 --- a/src/rdf4cpp/util/CharMatcher.hpp +++ b/src/rdf4cpp/util/CharMatcher.hpp @@ -327,6 +327,61 @@ struct PNChars_UnicodePartMatcher { */ constexpr auto PNCharsMatcher = ASCIINumMatcher{} | ASCIIPatternMatcher{"-"} | PNCharsUMatcher | PNChars_UnicodePartMatcher{}; +namespace xml { + /** + * Matches the unicode part (the characters listed as numbers) of NCNameStartChar of the XML specification + */ + struct NCNameStartChar_UnicodePartMatcher { + [[nodiscard]] static constexpr bool match(int c) noexcept { + return (c >= 0xC0 && c <= 0xD6) || + (c >= 0xD8 && c <= 0xF6) || + (c >= 0xF8 && c <= 0x2FF) || + (c >= 0x370 && c <= 0x37D) || + (c >= 0x37F && c <= 0x1FFF) || + (c >= 0x200C && c <= 0x200D) || + (c >= 0x2070 && c <= 0x218F) || + (c >= 0x2C00 && c <= 0x2FEF) || + (c >= 0x3001 && c <= 0xD7FF) || + (c >= 0xF900 && c <= 0xFDCF) || + (c >= 0xFDF0 && c <= 0xFFFD) || + (c >= 0x10000 && c <= 0xEFFFF); + } + + static constexpr size_t simd_range_num = 0; + static constexpr bool fail_if_unicode = false; + [[nodiscard]] static consteval std::array simd_ranges() noexcept { + return {}; + } + [[nodiscard]] static consteval auto simd_singles() noexcept { + return datatypes::registry::util::ConstexprString(""); + } + }; + + + /** + * 
Matches the unicode part (the characters listed as numbers) of NCNameChar of the XML specification + */ + struct NCNameChar_UnicodePartMatcher { + [[nodiscard]] static constexpr bool match(int c) noexcept { + return c == 0xB7 || + (c >= 0x0300 && c <= 0x036F) || + (c >= 0x203F && c <= 0x2040); + } + + static constexpr size_t simd_range_num = 0; + static constexpr bool fail_if_unicode = false; + [[nodiscard]] static consteval std::array simd_ranges() noexcept { + return {}; + } + [[nodiscard]] static consteval auto simd_singles() noexcept { + return datatypes::registry::util::ConstexprString(""); + } + }; + + constexpr auto NCNameStartChar = ASCIIAlphaMatcher{} | ASCIIPatternMatcher{"_"} | NCNameStartChar_UnicodePartMatcher{}; + constexpr auto NCNameChar = ASCIIAlphaMatcher{} | ASCIINumMatcher{} | ASCIIPatternMatcher{"_-."} | NCNameStartChar_UnicodePartMatcher{} | NCNameChar_UnicodePartMatcher{}; +} + /** * iterates over s and tries to match all in m. * attempts to do an ASCII SIMD match first, if that does not decide the matching, decodes the utf-8 and matches char by char. diff --git a/tests/parser/tests_XMLParser.cpp b/tests/parser/tests_XMLParser.cpp index 15dcf204..e69b20c4 100644 --- a/tests/parser/tests_XMLParser.cpp +++ b/tests/parser/tests_XMLParser.cpp @@ -140,754 +140,6 @@ TEST_CASE("sanity test") { CHECK(it == std::default_sentinel); } -TEST_CASE("rdf xml positive tests") { - // adapted from https://github.com/w3c/rdf-tests/tree/main/rdf/rdf11/rdf-xml - - std::string xml = ""; - std::string nt = ""; - - - SUBCASE("syntax 1 (base applies to id)") { - xml = R"( - - - - -)"; - nt = R"( "v" .)"; - } - SUBCASE("syntax 2 (base applies to resource)") { - xml = R"( - - - - - - -)"; - nt = R"( .)"; - } - SUBCASE("syntax 3 (base applies to about)") { - xml = R"( - - - - -)"; - nt = R"( .)"; - } - // case 4 needs reification - SUBCASE("syntax 6 (base scoping)") { - xml = R"( - - - - - -)"; - nt = R"( "v" . 
- .)"; - } - SUBCASE("syntax 7 (relative resolution)") { - xml = R"( - - - - - -)"; - nt = R"( .)"; - } - SUBCASE("syntax 8 (empty local)") { - xml = R"( - - - - - -)"; - nt = R"( .)"; - } - SUBCASE("syntax 9 (absolute path)") { - xml = R"( - - - - - -)"; - nt = R"( .)"; - } - SUBCASE("syntax 10 (absolute host)") { - xml = R"( - - - - - -)"; - nt = R"( .)"; - } - SUBCASE("syntax 11 (base without path)") { - xml = R"( - - - - - -)"; - nt = R"( .)"; - } - SUBCASE("syntax 13 (base with fragment)") { - xml = R"( - - - - - - - - -)"; - nt = R"( . - .)"; - } - SUBCASE("syntax 14 (same ids)") { - xml = R"( - - - - - - -)"; - nt = R"( "v" . - "v" .)"; - } - SUBCASE("amp") { - xml = R"( - - - - xxx - - - xxx - - -)"; - nt = R"( "xxx" . - "xxx" .)"; - } - SUBCASE("datatypes") { - xml = R"( - - - - 10 - 10 - - -)"; - nt = R"( "10"^^ . - "10"^^ .)"; - } - SUBCASE("unicode literal") { - xml = R"( - - - - - - - - - -)"; - nt = R"(_:a "D\u00FCrst" . - _:a .)"; - } - SUBCASE("unicode iri 1") { - xml = R"( - - - - - - 2000 - -)"; - nt = R"( "2000" .)"; - } - SUBCASE("unicode iri 2") { - xml = R"( - - - - - - 2000 - -)"; - nt = R"( "2000" .)"; - } - SUBCASE("type instead of description") { - xml = R"( - - - - Dogs in Hats - - -)"; - nt = R"( . - "Dogs in Hats" .)"; - } - SUBCASE("id 1") { - xml = R"( - - - abc - -)"; - nt = R"( "abc" .)"; - } - SUBCASE("id 2") { - xml = R"( - - - abc - -)"; - nt = R"( "abc" .)"; - } - SUBCASE("id 3") { - xml = R"( - - - abc - -)"; - nt = R"( "abc" .)"; - } - SUBCASE("duplicate bag entries") { - xml = R"( - - - - - -)"; - nt = R"( . - . 
- .)"; - } - SUBCASE("empty property 1") { - xml = R"( - - - - - - -)"; - nt = R"( .)"; - } - SUBCASE("empty property 2") { - xml = R"( - - - - - - -)"; - nt = R"( "" .)"; - } - SUBCASE("empty property 3") { - xml = R"( - - - - - - -)"; - nt = R"( ""^^ .)"; - } - SUBCASE("empty property 4") { - xml = R"( - - - - - - -)"; - nt = R"( _:a1 .)"; - } - SUBCASE("empty property 13") { - xml = R"( - - - - - -)"; - nt = R"( "baz" . - .)"; - } - SUBCASE("blank node identity") { - xml = R"( - - - - property value - - -)"; - nt = R"(_:j0 . -_:j0 "property value" .)"; - } - SUBCASE("blank node identity 2") { - xml = R"( - - - - - - - - - - - - - -)"; - nt = R"(_:j0A _:j0A . -_:j2 _:j1B . -_:j1B _:j0A .)"; - } - SUBCASE("collection") { - xml = R"( - - - - - - - - - - -)"; - nt = R"( _:a0 . -_:a0 _:a1 . -_:a1 . -_:a1 _:a2 . -_:a2 . -_:a2 .)"; - } - SUBCASE("nested reify") { - xml = R"( - - - - - - - - - -)"; - nt = R"( - . - . - . - . - . - . - . - . - . - .)"; - } - SUBCASE("reify target") { - xml = R"( - - - - - -)"; - nt = R"(_:j88091 "val" . -_:j88090 _:j88091 . - _:j88090 . - . - _:j88091 . - .)"; - } - SUBCASE("reify collection") { - xml = R"( - - - - - - - - - - -)"; - nt = R"( _:a0 . -_:a0 _:a1 . - _:a0 . - . - _:a1 . - . -_:a1 . -_:a1 _:a2 . -_:a2 . -_:a2 .)"; - } - SUBCASE("reify literal") { - xml = R"( - - - - v - - -)"; - nt = R"(_:j0 "v" . - _:j0 . - . - "v" . - .)"; - } - SUBCASE("lang literal") { - xml = R"( - - - - chat - -)"; - nt = R"( "chat"@fr .)"; - } - SUBCASE("lang literal attribute") { - xml = R"( - - - - -)"; - nt = R"( "chat"@fr .)"; - } - SUBCASE("simple list") { - xml = R"( - - - - 1 - 2 - -)"; - nt = R"(_:bag . -_:bag "1" . -_:bag "2" .)"; - } - SUBCASE("list interference") { - xml = R"( - - - - - _1 - 1 - _3 - 2 - -)"; - nt = R"(_:bag . -_:bag "_1" . -_:bag "1" . -_:bag "_3" . -_:bag "2" .)"; - } - SUBCASE("list advanced") { - xml = R"( - - - - - 1 - 2 - - - - - -)"; - nt = R"(_:bar . -_:bar "1" . - _:bar . - . - "1" . - . -_:bar "2"^^ . 
-_:bar _:res . -_:res . -_:res2 "foobar" . -_:bar _:res2 . - _:bar . - . - _:res2 . - .)"; - } - SUBCASE("list other") { - xml = R"( - - - - - barfoo - - -)"; - nt = R"( . - "3" . - "foobar" . - . - "2" . - "foobar" . - "barfoo" . -_:bag .)"; - } - SUBCASE("list independence") { - xml = R"( - - - - - - 1 - 2 - - - 2 - -)"; - nt = R"(_:d1 _:d2 . - -_:d2 "1" . -_:d2 "2" . - -_:d1 "2" .)"; - } - SUBCASE("list per element") { - xml = R"( - - - - 1 - - - - 1-again - -)"; - nt = R"( "1" . - "1-again" .)"; - } - SUBCASE("xml literal") { - xml = R"( - - - - -
-
- -
)"; - nt = R"( "
"^^ .)"; - } - - if (xml.empty()) { - return; - } - - std::stringstream xml_str{xml}; - IStreamQuadIterator xml_iter{xml_str, ParsingFlag::RdfXml}; - - std::stringstream nt_str{nt}; - IStreamQuadIterator nt_iter{nt_str, ParsingFlag::NTriples}; - - std::map bn_map{}; - auto check = [&bn_map](Node xml, Node nt) { - if (nt.is_blank_node() && xml.is_blank_node()) { - auto i = bn_map.find(nt.as_blank_node()); - if (i != bn_map.end()) { - CHECK(xml.as_blank_node() == i->second.as_blank_node()); - } - else { - bn_map[nt.as_blank_node()] = xml.as_blank_node(); - } - } - else { - CHECK(xml == nt); - } - }; - - while (nt_iter != std::default_sentinel) { - REQUIRE(xml_iter != std::default_sentinel); - if (!xml_iter->has_value()) { - FAIL(xml_iter->error().message); - } - REQUIRE(nt_iter->has_value()); - check(xml_iter->value().subject() , nt_iter->value().subject()); - check(xml_iter->value().predicate(), nt_iter->value().predicate()); - check(xml_iter->value().object(), nt_iter->value().object()); - - ++xml_iter; - ++nt_iter; - } - - REQUIRE(xml_iter == std::default_sentinel); -} - -TEST_CASE("rdf xml negative tests") { - // adapted from https://github.com/w3c/rdf-tests/tree/main/rdf/rdf11/rdf-xml - std::string xml = ""; - std::vector> expected_msg{}; - bool ignore_some_triples = false; - - SUBCASE("resource + parse type") { - xml = R"( - - - - - -)"; - expected_msg.emplace_back(ParsingError::Type::BadSyntax, "expected only one of rdf:parseType, rdf:nodeID and rdf:resource"); - ignore_some_triples = true; - } - SUBCASE("implicit bn + parse type") { - xml = R"( - - - - - -)"; - expected_msg.emplace_back(ParsingError::Type::BadSyntax, "expected only one of rdf:parseType, rdf:nodeID and rdf:resource"); - ignore_some_triples = true; - } - SUBCASE("multiple ids") { - xml = R"( - - - abc - - - abc - -)"; - expected_msg.emplace_back(ParsingError::Type::BadIri, ": is already used as a rdf:ID"); - ignore_some_triples = true; - } - - if (xml.empty()) { - return; - } - - 
std::stringstream xml_str{xml}; - IStreamQuadIterator xml_iter{xml_str, ParsingFlag::RdfXml}; - - while (xml_iter != std::default_sentinel) { - if (!ignore_some_triples) { - REQUIRE(!xml_iter->has_value()); - } else if (xml_iter->has_value()) { - ++xml_iter; - continue; - } - REQUIRE(!expected_msg.empty()); - CHECK(xml_iter->error().error_type == expected_msg.back().first); - CHECK(xml_iter->error().message == expected_msg.back().second); - expected_msg.pop_back(); - ++xml_iter; - } - REQUIRE(expected_msg.empty()); -} - void xml_test_positive(std::string xml_str, std::string nt_str, std::string_view base_iri) { CAPTURE(base_iri); @@ -935,6 +187,15 @@ void xml_test_positive(std::string xml_str, std::string nt_str, std::string_view if (a_bl != b_bl) { return std::less{}(a_bl, b_bl); } + if (a.subject() != b.subject() && !a.subject().is_blank_node() && !b.subject().is_blank_node()) { + return std::less{}(a.subject(), b.subject()); + } + if (a.predicate() != b.predicate() && !a.predicate().is_blank_node() && !b.predicate().is_blank_node()) { + return std::less{}(a.predicate(), b.predicate()); + } + if (!a.object().is_blank_node() && !b.object().is_blank_node()) { + return std::less{}(a.object(), b.object()); + } if (a.subject() != b.subject()) { return std::less{}(a.subject(), b.subject()); } @@ -1018,7 +279,7 @@ TEST_CASE("test cases from rdf-tests") { // positive tests xml_test_positive(remote_test_file_to_str("amp-in-url/test001.rdf"), remote_test_file_to_str("amp-in-url/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/amp-in-url/test001.rdf"); xml_test_positive(remote_test_file_to_str("datatypes/test001.rdf"), remote_test_file_to_str("datatypes/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/datatypes/test001.rdf"); - // xml_test_positive(remote_test_file_to_str("datatypes/test002.rdf"), remote_test_file_to_str("datatypes/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/datatypes/test002.rdf"); + // 
xml_test_positive(remote_test_file_to_str("datatypes/test002.rdf"), remote_test_file_to_str("datatypes/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/datatypes/test002.rdf"); // invalid integer xml_test_positive(remote_test_file_to_str("rdf-charmod-literals/test001.rdf"), remote_test_file_to_str("rdf-charmod-literals/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-charmod-literals/test001.rdf"); xml_test_positive(remote_test_file_to_str("rdf-charmod-uris/test001.rdf"), remote_test_file_to_str("rdf-charmod-uris/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-charmod-uris/test001.rdf"); xml_test_positive(remote_test_file_to_str("rdf-charmod-uris/test002.rdf"), remote_test_file_to_str("rdf-charmod-uris/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-charmod-uris/test002.rdf"); @@ -1029,7 +290,7 @@ TEST_CASE("test cases from rdf-tests") { xml_test_positive(remote_test_file_to_str("rdf-containers-syntax-vs-schema/test006.rdf"), remote_test_file_to_str("rdf-containers-syntax-vs-schema/test006.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/test006.rdf"); xml_test_positive(remote_test_file_to_str("rdf-containers-syntax-vs-schema/test007.rdf"), remote_test_file_to_str("rdf-containers-syntax-vs-schema/test007.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/test007.rdf"); xml_test_positive(remote_test_file_to_str("rdf-containers-syntax-vs-schema/test008.rdf"), remote_test_file_to_str("rdf-containers-syntax-vs-schema/test008.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/test008.rdf"); - // xml_test_positive(remote_test_file_to_str("rdf-element-not-mandatory/test001.rdf"), remote_test_file_to_str("rdf-element-not-mandatory/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-element-not-mandatory/test001.rdf"); + 
xml_test_positive(remote_test_file_to_str("rdf-element-not-mandatory/test001.rdf"), remote_test_file_to_str("rdf-element-not-mandatory/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-element-not-mandatory/test001.rdf"); xml_test_positive(remote_test_file_to_str("rdf-node-element/test001.rdf"), remote_test_file_to_str("rdf-node-element/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-node-element/test001.rdf"); xml_test_positive(remote_test_file_to_str("rdf-ns-prefix-confusion/test0001.rdf"), remote_test_file_to_str("rdf-ns-prefix-confusion/test0001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-ns-prefix-confusion/test0001.rdf"); xml_test_positive(remote_test_file_to_str("rdf-ns-prefix-confusion/test0003.rdf"), remote_test_file_to_str("rdf-ns-prefix-confusion/test0003.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-ns-prefix-confusion/test0003.rdf"); @@ -1126,9 +387,9 @@ TEST_CASE("test cases from rdf-tests") { xml_test_positive(remote_test_file_to_str("rdfms-xmllang/test006.rdf"), remote_test_file_to_str("rdfms-xmllang/test006.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-xmllang/test006.rdf"); xml_test_positive(remote_test_file_to_str("rdfs-domain-and-range/test001.rdf"), remote_test_file_to_str("rdfs-domain-and-range/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfs-domain-and-range/test001.rdf"); xml_test_positive(remote_test_file_to_str("rdfs-domain-and-range/test002.rdf"), remote_test_file_to_str("rdfs-domain-and-range/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfs-domain-and-range/test002.rdf"); - // xml_test_positive(remote_test_file_to_str("unrecognised-xml-attributes/test001.rdf"), remote_test_file_to_str("unrecognised-xml-attributes/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/unrecognised-xml-attributes/test001.rdf"); - // xml_test_positive(remote_test_file_to_str("unrecognised-xml-attributes/test002.rdf"), 
remote_test_file_to_str("unrecognised-xml-attributes/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/unrecognised-xml-attributes/test002.rdf"); - // xml_test_positive(remote_test_file_to_str("xml-canon/test001.rdf"), remote_test_file_to_str("xml-canon/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xml-canon/test001.rdf"); + xml_test_positive(remote_test_file_to_str("unrecognised-xml-attributes/test001.rdf"), remote_test_file_to_str("unrecognised-xml-attributes/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/unrecognised-xml-attributes/test001.rdf"); + xml_test_positive(remote_test_file_to_str("unrecognised-xml-attributes/test002.rdf"), remote_test_file_to_str("unrecognised-xml-attributes/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/unrecognised-xml-attributes/test002.rdf"); + // xml_test_positive(remote_test_file_to_str("xml-canon/test001.rdf"), remote_test_file_to_str("xml-canon/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xml-canon/test001.rdf"); // XMLLiteral is not exactly as the spec defines // xml_test_positive(remote_test_file_to_str("xml-canon/test002.rdf"), remote_test_file_to_str("xml-canon/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xml-canon/test002.rdf"); xml_test_positive(remote_test_file_to_str("xmlbase/test001.rdf"), remote_test_file_to_str("xmlbase/test001.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test001.rdf"); xml_test_positive(remote_test_file_to_str("xmlbase/test002.rdf"), remote_test_file_to_str("xmlbase/test002.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test002.rdf"); @@ -1143,44 +404,44 @@ TEST_CASE("test cases from rdf-tests") { xml_test_positive(remote_test_file_to_str("xmlbase/test013.rdf"), remote_test_file_to_str("xmlbase/test013.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test013.rdf"); xml_test_positive(remote_test_file_to_str("xmlbase/test014.rdf"), 
remote_test_file_to_str("xmlbase/test014.nt"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/xmlbase/test014.rdf"); // negative tests - // xml_test_negative(remote_test_file_to_str("rdf-containers-syntax-vs-schema/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/error001.rdf"); - // xml_test_negative(remote_test_file_to_str("rdf-containers-syntax-vs-schema/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/error002.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-abouteach/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-abouteach/error001.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-abouteach/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-abouteach/error002.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-difference-between-ID-and-about/error1.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-difference-between-ID-and-about/error1.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-empty-property-elements/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/error001.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-empty-property-elements/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/error002.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error001.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error002.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error003.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error003.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error004.rdf"), 
"https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error004.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error005.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error005.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error006.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error006.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error007.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error007.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-001.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-002.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-003.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-003.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-004.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-004.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-005.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-005.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-006.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-006.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-007.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-007.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-008.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-008.rdf"); - // 
xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-009.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-009.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-010.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-010.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-011.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-011.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-012.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-012.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-013.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-013.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-014.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-014.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-015.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-015.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-016.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-016.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-017.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-017.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-018.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-018.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-019.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-019.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-020.rdf"), 
"https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-020.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error001.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error002.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error003.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error003.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error004.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error004.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error005.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error005.rdf"); - // xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error006.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error006.rdf"); + xml_test_negative(remote_test_file_to_str("rdf-containers-syntax-vs-schema/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/error001.rdf"); + xml_test_negative(remote_test_file_to_str("rdf-containers-syntax-vs-schema/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdf-containers-syntax-vs-schema/error002.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-abouteach/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-abouteach/error001.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-abouteach/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-abouteach/error002.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-difference-between-ID-and-about/error1.rdf"), 
"https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-difference-between-ID-and-about/error1.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-empty-property-elements/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/error001.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-empty-property-elements/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-empty-property-elements/error002.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error001.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error002.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error003.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error003.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error004.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error004.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error005.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error005.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error006.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error006.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-id/error007.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-id/error007.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-001.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-002.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-003.rdf"), 
"https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-003.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-004.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-004.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-005.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-005.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-006.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-006.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-007.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-007.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-008.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-008.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-009.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-009.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-010.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-010.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-011.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-011.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-012.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-012.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-013.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-013.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-014.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-014.rdf"); + 
xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-015.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-015.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-016.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-016.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-017.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-017.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-018.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-018.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-019.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-019.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-rdf-names-use/error-020.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-rdf-names-use/error-020.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error001.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error001.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error002.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error002.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error003.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error003.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error004.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error004.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error005.rdf"), "https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error005.rdf"); + xml_test_negative(remote_test_file_to_str("rdfms-syntax-incomplete/error006.rdf"), 
"https://w3c.github.io/rdf-tests/rdf/rdf11/rdf-xml/rdfms-syntax-incomplete/error006.rdf"); } From fc3e9adc040367ca684b2b302bfc8c0080c47b12 Mon Sep 17 00:00:00 2001 From: mcb Date: Wed, 21 Jan 2026 17:04:31 +0100 Subject: [PATCH 42/42] review --- private/rdf4cpp/parser/XMLParser.cpp | 7 +++-- .../parser/XMLParserStateTransition.hpp | 10 +------ .../XMLStates/XMLParserCollectionState.cpp | 4 ++- .../XMLStates/XMLParserDescriptionState.cpp | 26 ++++++++++--------- .../XMLStates/XMLParserEmptyElement.cpp | 2 +- .../XMLStates/XMLParserInitialState.cpp | 2 +- .../XMLStates/XMLParserPredicateState.cpp | 2 +- .../parser/XMLStates/XMLParserRDFState.cpp | 2 +- .../XMLParserTypedLiteralPredicateState.cpp | 2 +- .../XMLStates/XMLParserXMLLiteralState.cpp | 6 ++++- .../XMLStates/XMLParserXMLLiteralState.hpp | 3 +++ 11 files changed, 36 insertions(+), 30 deletions(-) diff --git a/private/rdf4cpp/parser/XMLParser.cpp b/private/rdf4cpp/parser/XMLParser.cpp index 40f70431..cc952651 100644 --- a/private/rdf4cpp/parser/XMLParser.cpp +++ b/private/rdf4cpp/parser/XMLParser.cpp @@ -79,10 +79,12 @@ namespace rdf4cpp::parser { void IStreamQuadIterator::ImplXML::on_error(void *th, char const *msg, ...) 
{ // NOLINT(*-dcl50-cpp) va_list args; + va_list args_copy; auto t = static_cast(th); va_start(args, msg); // NOLINT(*-pro-bounds-array-to-pointer-decay) + va_copy(args_copy, args); // NOLINT(*-pro-bounds-array-to-pointer-decay) std::string out{}; - out.resize(1024, '\0'); + out.resize(1+vsnprintf(nullptr, 0, msg, args_copy), '\0'); // NOLINT(*-pro-bounds-array-to-pointer-decay) auto l = vsnprintf(out.data(), out.size(), msg, args); // NOLINT(*-pro-bounds-array-to-pointer-decay) if (l > 0) { out.resize(l); @@ -91,6 +93,7 @@ namespace rdf4cpp::parser { } t->output_.add_error(ParsingError::Type::BadSyntax, std::move(out), t->make_info()); va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay) + va_end(args_copy); // NOLINT(*-pro-bounds-array-to-pointer-decay) } xmlEntity *IStreamQuadIterator::ImplXML::get_entity(void *, xmlChar const *e) { return xmlGetPredefinedEntity(e); @@ -153,7 +156,7 @@ namespace rdf4cpp::parser { output_(state) { xmlCtxtSetOptions(context_.get(), XML_PARSE_NOENT | XML_PARSE_PEDANTIC | XML_PARSE_NOCDATA | XML_PARSE_NO_XXE | XML_PARSE_BIG_LINES); state_stack_.reserve(10); - state_stack_.emplace_back(std::in_place_type_t{}); + state_stack_.emplace_back(std::in_place_type); current_state().base = output_.current_base_iri(); } diff --git a/private/rdf4cpp/parser/XMLParserStateTransition.hpp b/private/rdf4cpp/parser/XMLParserStateTransition.hpp index 11fecca4..d6cf0c39 100644 --- a/private/rdf4cpp/parser/XMLParserStateTransition.hpp +++ b/private/rdf4cpp/parser/XMLParserStateTransition.hpp @@ -2,21 +2,13 @@ #define RDF4CPP_XMLPARSERSTATETRANSITION_H #include - #include - #include - #include - #include - #include - #include - #include - #include namespace rdf4cpp::parser { @@ -31,7 +23,7 @@ namespace rdf4cpp::parser { explicit StateTransition(Args &&...args) : modify_state(std::forward(args)...) 
{ } - StateTransition() noexcept : StateTransition(std::in_place_type_t{}) { + StateTransition() noexcept : StateTransition(std::in_place_type) { } }; } diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp index 2e77a2a4..424c93d1 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserCollectionState.cpp @@ -11,6 +11,8 @@ namespace rdf4cpp::parser::xml_states { } StateTransition CollectionState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span const attributes, XMLStateInfo const &info) { + // https://www.w3.org/TR/rdf11-xml/#parseTypeCollectionPropertyElt + // only node elements (=>DescriptionState) can appear in the list auto [transition, obj] = DescriptionState::enter(out, local_name, uri, attributes, info); if (first) { first = false; @@ -32,7 +34,7 @@ namespace rdf4cpp::parser::xml_states { } else { out.add_statement(last_bn, out.make_hardcoded_iri(iri_rest), nil, IRI::make_null()); } - return StateTransition{std::in_place_type_t{}}; + return StateTransition{std::in_place_type}; } void CollectionState::move_to(BaseState *b) noexcept { new (b) CollectionState(std::move(*this)); diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp index 0a368ccd..adedc2d6 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserDescriptionState.cpp @@ -13,15 +13,15 @@ namespace rdf4cpp::parser::xml_states { StateTransition DescriptionState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, std::span attributes, XMLStateInfo const &info) { if (iri_core_syntax(uri, local_name)) { out.add_error(ParsingError::Type::BadSyntax, "core syntax terms are not allowed as 
predicates", info); - return StateTransition(std::in_place_type_t{}); + return StateTransition(std::in_place_type); } if (iri_equal_pieces(start_element, uri, local_name)) { out.add_error(ParsingError::Type::BadSyntax, "rdf:Description is not allowed as predicate", info); - return StateTransition(std::in_place_type_t{}); + return StateTransition(std::in_place_type); } if (iri_old_term(uri, local_name)) { out.add_old_term_error(info); - return StateTransition(std::in_place_type_t{}); + return StateTransition(std::in_place_type); } auto const inherited_attribute_info = get_inherited_attributes(out, attributes, info); @@ -60,11 +60,13 @@ namespace rdf4cpp::parser::xml_states { parse_resource = true; } else if (att.value() == PredicateState::parse_type_collection) { parse_collection = true; - } else { + } else { // literal is the default case that's supposed to be used if anything unknown appears parse_literal = true; } } } + // need to loop twice, because anything in the second loop needs an established sub + // and the xml spec allows attributes in arbitrary order for (auto const &att : attributes) { if (iri_equal_pieces(PredicateState::list_start_element, att.uri(), att.local_name())) { out.add_error(ParsingError::Type::BadSyntax, "rdf:li is not allowed as attribute", info); @@ -97,26 +99,26 @@ namespace rdf4cpp::parser::xml_states { out.add_error(ParsingError::Type::BadSyntax, "expected only one of rdf:parseType, rdf:nodeID and rdf:resource", info); } if (datatype.has_value()) { - return StateTransition(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify, *datatype); + return StateTransition(std::in_place_type, inherited_attribute_info, subject, predicate, reify, *datatype); } else if (sub.has_value()) { out.add_statement(subject, predicate, *sub, reify); - return StateTransition(std::in_place_type_t{}); // predicate is expected to be empty, object defined as attribute + return StateTransition(std::in_place_type); // predicate is expected to
be empty, object defined as attribute // example: https://www.w3.org/2013/RDFXMLTests/rdfms-empty-property-elements/test013.rdf } else if (parse_resource) { Node const obj = out.make_bn(std::nullopt, info); out.add_statement(subject, predicate, obj, reify); - return StateTransition(std::in_place_type_t{}, inherited_attribute_info, obj); + return StateTransition(std::in_place_type, inherited_attribute_info, obj); } else if (parse_literal) { - return StateTransition(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify, info); + return StateTransition(std::in_place_type, inherited_attribute_info, subject, predicate, reify, info); } else if (parse_collection) { - return StateTransition(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); + return StateTransition(std::in_place_type, inherited_attribute_info, subject, predicate, reify); } else { - return StateTransition(std::in_place_type_t{}, inherited_attribute_info, subject, predicate, reify); + return StateTransition(std::in_place_type, inherited_attribute_info, subject, predicate, reify); } } StateTransition DescriptionState::on_end_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] XMLStateInfo const &info) { - return StateTransition{std::in_place_type_t{}}; + return StateTransition{std::in_place_type}; } void DescriptionState::move_to(BaseState *b) noexcept { new (b) DescriptionState(std::move(*this)); @@ -194,7 +196,7 @@ namespace rdf4cpp::parser::xml_states { } } return { - StateTransition{std::in_place_type_t{}, inherited_attribute_info, sub}, + StateTransition{std::in_place_type, inherited_attribute_info, sub}, sub, }; } diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp index 26a53783..4bd4e50e 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserEmptyElement.cpp @@ -16,7 +16,7 @@ namespace 
rdf4cpp::parser::xml_states { } StateTransition EmptyElement::on_end_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] XMLStateInfo const &info) { - return StateTransition{std::in_place_type_t{}}; + return StateTransition{std::in_place_type}; } void EmptyElement::move_to(BaseState *b) noexcept { new (b) EmptyElement(std::move(*this)); diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp index 56c24f3e..287677d2 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserInitialState.cpp @@ -13,7 +13,7 @@ namespace rdf4cpp::parser::xml_states { StateTransition InitialState::on_start_element(XMLOutputQueue &out, std::string_view const local_name, std::string_view const uri, [[maybe_unused]] std::span attributes, XMLStateInfo const &info) { if (iri_equal_pieces(RDFState::start_element, uri, local_name)) { return StateTransition{ - std::in_place_type_t{}, + std::in_place_type, get_inherited_attributes(out, attributes, info), }; } diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp index ad7a5dc8..a39c5c49 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserPredicateState.cpp @@ -35,7 +35,7 @@ namespace rdf4cpp::parser::xml_states { Literal const lit = out.make_literal(literal, std::nullopt, std::nullopt, info); out.add_statement(subject, predicate, lit, reify); } - return StateTransition{std::in_place_type_t{}}; + return StateTransition{std::in_place_type}; } void PredicateState::move_to(BaseState *b) noexcept { new (b) PredicateState(std::move(*this)); diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp index b44f5691..81884b0a 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp 
+++ b/private/rdf4cpp/parser/XMLStates/XMLParserRDFState.cpp @@ -16,7 +16,7 @@ namespace rdf4cpp::parser::xml_states { } StateTransition RDFState::on_end_element([[maybe_unused]] XMLOutputQueue &out, [[maybe_unused]] XMLStateInfo const &info) { - return StateTransition{std::in_place_type_t{}}; + return StateTransition{std::in_place_type}; } void RDFState::move_to(BaseState *b) noexcept { new (b) RDFState(std::move(*this)); diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp index 62fb99dc..d2c1c235 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.cpp @@ -13,7 +13,7 @@ namespace rdf4cpp::parser::xml_states { Literal const lit = out.make_literal(literal, datatype, std::nullopt, info); out.add_statement(subject, predicate, lit, reify); } - return StateTransition{std::in_place_type_t{}}; + return StateTransition{std::in_place_type}; } void TypedLiteralPredicateState::move_to(BaseState *b) noexcept { new (b) TypedLiteralPredicateState(std::move(*this)); diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp index 871209e9..509361af 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.cpp @@ -21,9 +21,12 @@ namespace rdf4cpp::parser::xml_states { return {}; } IRI datatype = out.make_hardcoded_iri("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"); + // filter out the parts of the source that are not part of the literal std::string_view l = literal; l = l.substr(0, last_offset); l.remove_prefix(data_start); + // filter out the end of the start tag + // this tag belongs to the predicate if (!l.empty() && l[0] == '/') { l.remove_prefix(1); } @@ -32,13 +35,14 @@ namespace 
rdf4cpp::parser::xml_states { } Literal const lit = out.make_literal(l, datatype, std::nullopt, info); out.add_statement(subject, predicate, lit, reify); - return StateTransition{std::in_place_type_t{}}; + return StateTransition{std::in_place_type}; } void XMLLiteralState::move_to(BaseState *b) noexcept { new (b) XMLLiteralState(std::move(*this)); } void XMLLiteralState::source_input(XMLStateInfo const &info) { + // collect all the different source parts and append them int const off = info.source_offset; std::string_view const sv = info.source; if (literal.empty()) { diff --git a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp index 7ffbdfff..96c7acba 100644 --- a/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp +++ b/private/rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp @@ -13,6 +13,9 @@ namespace rdf4cpp::parser::xml_states { * * * + * + * note: + * this implementation does not match the specification exactly (omitting namespaces) */ struct XMLLiteralState final : PredicateState { StateTransition on_characters(XMLOutputQueue &out, std::string_view chars, XMLStateInfo const &info) override;