-
Notifications
You must be signed in to change notification settings - Fork 11
Feature: RDF/XML parser #410
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
45 commits
Select commit
Hold shift + click to select a range
e6f48d1
basic xml parsing
mcb5637 2c92962
api, remove uneccesary graph
mcb5637 105e65d
fix sanitizer
mcb5637 24b3c03
cleanup
mcb5637 4c1b52e
entities
mcb5637 e518b99
more syntax+tests
mcb5637 e112429
base scoping
mcb5637 d040b2e
more tests
mcb5637 cfd7664
parsing state
mcb5637 db888f8
triples in properties
mcb5637 55d2cc9
more tests
mcb5637 ede429f
wip xml literal
mcb5637 ea0a1ae
cleanup
mcb5637 74a2538
collection
mcb5637 48e63c6
reifycation
mcb5637 97327ed
lang tag
mcb5637 b74a806
some cleanup
mcb5637 fb86ba8
list
mcb5637 1f7f0cc
more cleanup
mcb5637 c1eebb0
one parser to rule them all
mcb5637 0098e62
try fix gcc error
mcb5637 9251a7f
another gcc fix
mcb5637 48adc71
cleanup
mcb5637 df06c32
Merge branch 'develop' into feature/parse_xml
mcb5637 a2dbc6f
initial base
mcb5637 dfd809c
xmlliteral tests, cleanup
mcb5637 d64130a
fixes
mcb5637 5e8bac8
inplace poly
mcb5637 40c0f5a
doc
mcb5637 7f4261f
reorganize
mcb5637 c40d4dd
separate states
mcb5637 5ca0c2f
fix gcc14 bug again
mcb5637 efa4374
Merge branch 'develop' into feature/parse_xml
mcb5637 6cf39d5
remove iconv
mcb5637 f8f6454
states no longer get a Impl&
mcb5637 7417925
reorganize classes
mcb5637 5682b8c
prepare for tests
mcb5637 286815a
remove self-referentialness of struct and minor cleanup
liss-h 88d9391
review
mcb5637 dd965a6
Merge remote-tracking branch 'origin/feature/parse_xml' into feature/…
mcb5637 e23492e
fix merge
mcb5637 71638ea
tests from rdf-tests repo
nkaralis 95d11d6
added missing test dep
nkaralis 4c26db9
tests
mcb5637 fc3e9ad
review
mcb5637 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,180 @@ | ||
| #include <rdf4cpp/parser/XMLParser.hpp> | ||
|
|
||
| #include <rdf4cpp/parser/XMLParserStateTransition.hpp> | ||
|
|
||
| #include <ranges> | ||
|
|
||
| namespace rdf4cpp::parser { | ||
| /// Builds the SAX callback table that routes libxml2 events to the static handlers of ImplXML. | ||
| /// Every callback that is not assigned below stays null. | ||
| xmlSAXHandler IStreamQuadIterator::ImplXML::make_sax_handler() { | ||
| xmlSAXHandler sax{}; | ||
| std::memset(&sax, 0, sizeof(sax)); | ||
| sax.initialized = XML_SAX2_MAGIC; | ||
| // element structure and text content | ||
| sax.startElementNs = on_start_element; | ||
| sax.endElementNs = on_end_element; | ||
| sax.characters = on_characters; | ||
| // general and parameter entities share one lookup | ||
| sax.getEntity = get_entity; | ||
| sax.getParameterEntity = get_entity; | ||
| // warnings and errors are reported through the same handler | ||
| sax.error = on_error; | ||
| sax.warning = on_error; | ||
| return sax; | ||
| } | ||
|
|
||
| /// Applies the stack modification requested by a state callback. | ||
| /// NoStateChange leaves the stack untouched, PopState removes the top state, | ||
| /// and any other alternative is moved onto the stack as the new top state. | ||
| void IStreamQuadIterator::ImplXML::handle_state_transition(StateTransition transition) { | ||
| dice::template_library::match(std::move(transition.modify_state), | ||
| [](NoStateChange) { | ||
| // stay in the current state | ||
| }, | ||
| [this](PopState) { | ||
| state_stack_.pop_back(); | ||
| }, | ||
| [this]<typename NextState>(NextState &&next_state) { | ||
| // construct the stack entry in place, tagged with the concrete state type | ||
| state_stack_.emplace_back(std::in_place_type<NextState>, std::forward<NextState>(next_state)); | ||
| } | ||
| ); | ||
| } | ||
|
|
||
| // Defined in this translation unit because the reserved names are constants of the parser states. | ||
| /// Returns true if iri_equal_pieces reports a match between (uri, local_name) and any reserved name. | ||
| /// The list consists of the names checked by iri_core_syntax and iri_old_term | ||
| /// plus the datatype, base and lang attributes. | ||
| bool iri_reserved(std::string_view const uri, std::string_view const local_name) { | ||
| static constexpr std::array reserved_names = { | ||
| xml_states::RDFState::start_element, | ||
| xml_states::DescriptionState::id_attrib, | ||
| xml_states::DescriptionState::about_attrib, | ||
| xml_states::PredicateState::parse_type_attrib, | ||
| xml_states::PredicateState::resource_attrib, | ||
| xml_states::DescriptionState::node_id_attrib, | ||
| xml_states::TypedLiteralPredicateState::datatype_attrib, | ||
| xml_states::BaseState::base_attribute, | ||
| xml_states::BaseState::lang_attribute, | ||
| std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"), | ||
| std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"), | ||
| std::string_view("http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"), | ||
| }; | ||
| for (std::string_view const name : reserved_names) { | ||
| if (iri_equal_pieces(name, uri, local_name)) { | ||
| return true; | ||
| } | ||
| } | ||
| return false; | ||
| } | ||
| /// Returns true if iri_equal_pieces reports a match between (uri, local_name) and one of the | ||
| /// core syntax names: the RDF start element and the ID, about, parseType, resource and nodeID attributes. | ||
| bool iri_core_syntax(std::string_view const uri, std::string_view const local_name) { | ||
| static constexpr std::array core_names = { | ||
| xml_states::RDFState::start_element, | ||
| xml_states::DescriptionState::id_attrib, | ||
| xml_states::DescriptionState::about_attrib, | ||
| xml_states::PredicateState::parse_type_attrib, | ||
| xml_states::PredicateState::resource_attrib, | ||
| xml_states::DescriptionState::node_id_attrib, | ||
| }; | ||
| for (std::string_view const name : core_names) { | ||
| if (iri_equal_pieces(name, uri, local_name)) { | ||
| return true; | ||
| } | ||
| } | ||
| return false; | ||
| } | ||
| /// Returns true if iri_equal_pieces reports a match between (uri, local_name) and one of the | ||
| /// old RDF terms aboutEach, aboutEachPrefix or bagID. | ||
| bool iri_old_term(std::string_view const uri, std::string_view const local_name) { | ||
| static constexpr std::array old_terms = { | ||
| std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"}, | ||
| std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"}, | ||
| std::string_view{"http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"}, | ||
| }; | ||
| for (std::string_view const term : old_terms) { | ||
| if (iri_equal_pieces(term, uri, local_name)) { | ||
| return true; | ||
| } | ||
| } | ||
| return false; | ||
| } | ||
|
|
||
| void IStreamQuadIterator::ImplXML::on_error(void *th, char const *msg, ...) { // NOLINT(*-dcl50-cpp) | ||
| va_list args; | ||
| va_list args_copy; | ||
| auto t = static_cast<ImplXML *>(th); | ||
| va_start(args, msg); // NOLINT(*-pro-bounds-array-to-pointer-decay) | ||
| va_copy(args_copy, args); // NOLINT(*-pro-bounds-array-to-pointer-decay) | ||
| std::string out{}; | ||
| out.resize(1+vsnprintf(nullptr, 0, msg, args_copy), '\0'); // NOLINT(*-pro-bounds-array-to-pointer-decay) | ||
| auto l = vsnprintf(out.data(), out.size(), msg, args); // NOLINT(*-pro-bounds-array-to-pointer-decay) | ||
|
mcb5637 marked this conversation as resolved.
|
||
| if (l > 0) { | ||
| out.resize(l); | ||
| } else { | ||
| out = "unknown error, too long to fit"; | ||
| } | ||
| t->output_.add_error(ParsingError::Type::BadSyntax, std::move(out), t->make_info()); | ||
| va_end(args); // NOLINT(*-pro-bounds-array-to-pointer-decay) | ||
| va_end(args_copy); // NOLINT(*-pro-bounds-array-to-pointer-decay) | ||
| } | ||
| /// Entity lookup callback: delegates to xmlGetPredefinedEntity; the user data pointer is ignored. | ||
| xmlEntity *IStreamQuadIterator::ImplXML::get_entity(void *, xmlChar const *e) { | ||
| xmlEntity *const predefined = xmlGetPredefinedEntity(e); | ||
| return predefined; | ||
| } | ||
| /// Character data callback: passes the text to the current state and applies the transition it returns. | ||
| /// @param th user data pointer, cast back to the ImplXML instance | ||
| /// @param e start of the character data | ||
| /// @param len length passed on to from_xml_char | ||
| void IStreamQuadIterator::ImplXML::on_characters(void *th, xmlChar const *e, int const len) { | ||
| auto *self = static_cast<ImplXML *>(th); | ||
| auto &state = self->current_state(); | ||
| self->handle_state_transition(state.on_characters(self->output_, from_xml_char(e, len), self->make_info())); | ||
| } | ||
| /// Start-element callback: passes local name, namespace URI and attributes to the current state | ||
| /// and applies the transition it returns. Prefix, namespace declarations and n_defaulted are unused. | ||
| void IStreamQuadIterator::ImplXML::on_start_element(void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri, | ||
| [[maybe_unused]] int n_namespaces, [[maybe_unused]] xmlChar const **namespaces, | ||
| int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) { | ||
| auto *self = static_cast<ImplXML *>(th); | ||
| // reinterpret the flat pointer array as n_attributes XMLAttribute records | ||
| // NOTE(review): assumes XMLAttribute matches libxml2's per-attribute pointer layout - confirm against its definition | ||
| std::span const attribute_view{reinterpret_cast<XMLAttribute *>(attributes), static_cast<size_t>(n_attributes)}; | ||
| auto &state = self->current_state(); | ||
| self->handle_state_transition(state.on_start_element(self->output_, from_xml_char(local_name), from_xml_char(uri), | ||
| attribute_view, self->make_info())); | ||
| } | ||
| /// End-element callback: notifies the current state and applies the transition it returns. | ||
| /// The element's name, prefix and URI are not used. | ||
| void IStreamQuadIterator::ImplXML::on_end_element(void *th, [[maybe_unused]] xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, [[maybe_unused]] xmlChar const *uri) { | ||
| auto *self = static_cast<ImplXML *>(th); | ||
| auto &state = self->current_state(); | ||
| self->handle_state_transition(state.on_end_element(self->output_, self->make_info())); | ||
| } | ||
|
|
||
| /// Collects the context passed to state callbacks: current line and column, the effective base | ||
| /// and language tag, and a window into the parser input together with an offset into that window. | ||
| /// The returned views refer to data owned by the state stack and by the parser context. | ||
| XMLStateInfo IStreamQuadIterator::ImplXML::make_info() const { | ||
| // the non-empty base closest to the top of the stack wins | ||
| std::string_view base = ""; | ||
| for (auto const &s : state_stack_ | std::views::reverse) { | ||
| std::string_view const v = s->base; | ||
| if (!v.empty()) { | ||
| base = v; | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| // same lookup for the language tag | ||
| std::string_view lang_tag = ""; | ||
| for (auto const &s : state_stack_ | std::views::reverse) { | ||
| std::string_view const v = s->lang_tag; | ||
| if (!v.empty()) { | ||
| lang_tag = v; | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| // data starts as nullptr so that a call which does not fill in a window can be detected | ||
| xmlChar const *data = nullptr; | ||
| int size = 1024; // requested window size | ||
| int off = 0; | ||
| xmlCtxtGetInputWindow(context_.get(), 0, &data, &size, &off); | ||
| // never build a view from a missing pointer or a negative length; fall back to an empty window | ||
| bool const has_window = data != nullptr && size >= 0; | ||
| if (!has_window) { | ||
| off = 0; | ||
| } | ||
| std::string_view const source = has_window | ||
| ? std::string_view{reinterpret_cast<char const *>(data), static_cast<size_t>(size)} | ||
| : std::string_view{}; | ||
|
|
||
| return XMLStateInfo{ | ||
| current_line(), | ||
| current_column(), | ||
| base, | ||
| lang_tag, | ||
| source, | ||
| off, | ||
| }; | ||
| } | ||
|
|
||
| /// Creates the push parser context with this object as SAX user data, stores the reader callbacks, | ||
| /// and starts the state stack with an InitialState whose base is the output queue's current base IRI. | ||
| IStreamQuadIterator::ImplXML::ImplXML(void *obj, ReadFunc const read, ErrorFunc const err, EOFFunc const eof, state_type *state) | ||
| : handler_(make_sax_handler()), | ||
| context_(xmlCreatePushParserCtxt(&handler_, this, nullptr, 0, "rdf/xml")), | ||
| reader_obj_(obj), | ||
| read_func_(read), | ||
| error_func_(err), | ||
| eof_func_(eof), | ||
| output_(state) { | ||
| constexpr auto parser_options = XML_PARSE_NOENT | XML_PARSE_PEDANTIC | XML_PARSE_NOCDATA | XML_PARSE_NO_XXE | XML_PARSE_BIG_LINES; | ||
| xmlCtxtSetOptions(context_.get(), parser_options); | ||
|
|
||
| // reserve room for the expected nesting depth up front | ||
| constexpr size_t initial_stack_capacity = 10; | ||
| state_stack_.reserve(initial_stack_capacity); | ||
| state_stack_.emplace_back(std::in_place_type<xml_states::InitialState>); | ||
|
|
||
| current_state().base = output_.current_base_iri(); | ||
| } | ||
|
|
||
| /// Feeds the parser chunk by chunk until the output queue is non-empty or the reader reports | ||
| /// an error or end of input, then returns whatever output_.next() yields. | ||
| std::optional<IStreamQuadIterator::value_type> IStreamQuadIterator::ImplXML::next() { | ||
| std::array<char, 8192> chunk; // NOLINT(*-pro-type-member-init) | ||
| auto const reader_usable = [this] { | ||
| return error_func_(reader_obj_) == 0 && eof_func_(reader_obj_) == 0; | ||
| }; | ||
| while (output_.empty() && reader_usable()) { | ||
| auto const n_read = read_func_(chunk.data(), sizeof(char), chunk.size(), reader_obj_); | ||
| // the last chunk is flagged so the parser can finish the document | ||
| bool const at_end = eof_func_(reader_obj_) != 0; | ||
| xmlParseChunk(context_.get(), chunk.data(), static_cast<int>(n_read), at_end); | ||
| } | ||
| return output_.next(); | ||
| } | ||
|
|
||
| /// Line number reported by xmlSAX2GetLineNumber for the parser context. | ||
| uint64_t IStreamQuadIterator::ImplXML::current_line() const noexcept { | ||
| auto const line = xmlSAX2GetLineNumber(context_.get()); | ||
| return line; | ||
| } | ||
|
|
||
| /// Column number reported by xmlSAX2GetColumnNumber for the parser context. | ||
| uint64_t IStreamQuadIterator::ImplXML::current_column() const noexcept { | ||
| auto const column = xmlSAX2GetColumnNumber(context_.get()); | ||
| return column; | ||
| } | ||
| } // namespace rdf4cpp::parser | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,91 @@ | ||
| #ifndef RDF4CPP_XMLPARSER_H | ||
| #define RDF4CPP_XMLPARSER_H | ||
|
|
||
| #include <rdf4cpp/Expected.hpp> | ||
| #include <rdf4cpp/Quad.hpp> | ||
| #include <rdf4cpp/IRIFactory.hpp> | ||
| #include <rdf4cpp/parser/IStreamQuadIterator.hpp> | ||
| #include <rdf4cpp/parser/XMLParserUtility.hpp> | ||
| #include <rdf4cpp/parser/XMLStates/XMLParserBaseState.hpp> | ||
| #include <rdf4cpp/parser/XMLStates/XMLParserInitialState.hpp> | ||
| #include <rdf4cpp/parser/XMLStates/XMLParserRDFState.hpp> | ||
| #include <rdf4cpp/parser/XMLStates/XMLParserDescriptionState.hpp> | ||
| #include <rdf4cpp/parser/XMLStates/XMLParserPredicateState.hpp> | ||
| #include <rdf4cpp/parser/XMLStates/XMLParserTypedLiteralPredicateState.hpp> | ||
| #include <rdf4cpp/parser/XMLStates/XMLParserXMLLiteralState.hpp> | ||
| #include <rdf4cpp/parser/XMLStates/XMLParserCollectionState.hpp> | ||
| #include <rdf4cpp/parser/XMLStates/XMLParserEmptyElement.hpp> | ||
|
|
||
| #include <dice/sparse-map/sparse_set.hpp> | ||
| #include <dice/template-library/inplace_polymorphic.hpp> | ||
|
|
||
| #include <libxml/parser.h> | ||
|
|
||
| #include <memory> | ||
| #include <vector> | ||
|
|
||
| namespace rdf4cpp::parser { | ||
| /// RDF/XML implementation of IStreamQuadIterator::Impl, built on libxml2's SAX push parser. | ||
| /// SAX events are dispatched to the state on top of state_stack_; results and errors go to output_. | ||
| /// Copying and moving are deleted. | ||
| struct IStreamQuadIterator::ImplXML final : Impl { | ||
| private: | ||
| /// SAX callback table, see make_sax_handler() | ||
| xmlSAXHandler handler_; | ||
| // workaround for gcc-14 bug, erroneously warns on using a lambda here | ||
| // see https://github.com/NVIDIA/stdexec/issues/1143 | ||
| struct XmlParserCtxtDtorLambda { | ||
| void operator()(xmlParserCtxt *c) const { | ||
| xmlFreeParserCtxt(c); | ||
| } | ||
| }; | ||
| /// owning handle of the libxml2 parser context, released with xmlFreeParserCtxt | ||
| std::unique_ptr<xmlParserCtxt, XmlParserCtxtDtorLambda> context_; | ||
| /// opaque reader object passed to read_func_, error_func_ and eof_func_ | ||
| void *reader_obj_; | ||
| ReadFunc read_func_; | ||
| ErrorFunc error_func_; | ||
| EOFFunc eof_func_; | ||
| /// queue of parsed results and errors | ||
| XMLOutputQueue output_; | ||
|
|
||
| /// storage for any of the concrete parser states, accessed through xml_states::BaseState | ||
| using State = dice::template_library::inplace_polymorphic<xml_states::BaseState, | ||
| xml_states::InitialState, xml_states::RDFState, | ||
| xml_states::DescriptionState, xml_states::PredicateState, | ||
| xml_states::TypedLiteralPredicateState, xml_states::EmptyElement, | ||
| xml_states::XMLLiteralState, xml_states::CollectionState>; | ||
|
|
||
| std::vector<State> state_stack_; // Note: we use a vector because std::stack does not have .reserve() | ||
|
|
||
| /// state on top of the stack; the stack must not be empty | ||
| [[nodiscard]] xml_states::BaseState const &current_state() const noexcept { | ||
| return *state_stack_.back(); | ||
| } | ||
|
|
||
| /// state on top of the stack; the stack must not be empty | ||
| [[nodiscard]] xml_states::BaseState &current_state() noexcept { | ||
| return *state_stack_.back(); | ||
| } | ||
|
|
||
| /// creates the SAX callback table that points to the static callbacks below | ||
| static xmlSAXHandler make_sax_handler(); | ||
|
|
||
| /// applies the stack change requested by a state callback | ||
| void handle_state_transition(StateTransition transition); | ||
|
|
||
| // SAX callbacks; th is the ImplXML instance | ||
| static void on_error(void *th, char const *msg, ...); | ||
| static xmlEntity *get_entity(void *th, xmlChar const *e); | ||
| static void on_characters(void *th, xmlChar const *e, int len); | ||
| static void on_start_element(void *th, xmlChar const *local_name, xmlChar const *prefix, xmlChar const *uri, | ||
| int n_namespaces, xmlChar const **namespaces, | ||
| int n_attributes, int n_defaulted, xmlChar const **attributes); | ||
| static void on_end_element(void *th, xmlChar const *local_name, xmlChar const *prefix, xmlChar const *uri); | ||
|
|
||
| /// gathers the context (position, base, language tag, input window) passed to state callbacks | ||
| [[nodiscard]] XMLStateInfo make_info() const; | ||
|
|
||
| public: | ||
| /// @param obj opaque reader object handed to the three callbacks | ||
| /// @param read reads the next chunk of input | ||
| /// @param err non-zero result signals a reader error | ||
| /// @param eof non-zero result signals end of input | ||
| /// @param state parser state forwarded to the output queue | ||
| ImplXML(void *obj, ReadFunc read, ErrorFunc err, EOFFunc eof, state_type *state); | ||
|
|
||
| ImplXML(ImplXML const &) = delete; | ||
| ImplXML &operator=(ImplXML const &) = delete; | ||
| ImplXML(ImplXML &&) = delete; | ||
| ImplXML &operator=(ImplXML &&) = delete; | ||
| ~ImplXML() override = default; | ||
|
|
||
| [[nodiscard]] std::optional<value_type> next() override; | ||
|
|
||
| [[nodiscard]] uint64_t current_line() const noexcept override; | ||
| [[nodiscard]] uint64_t current_column() const noexcept override; | ||
| }; | ||
| } // namespace rdf4cpp::parser | ||
|
|
||
| #endif //RDF4CPP_XMLPARSER_H |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.