1+ #include < rdf4cpp/parser/XMLParser.hpp>
2+
3+ #include < rdf4cpp/parser/XMLParserStateTransition.hpp>
4+
5+ #include < ranges>
6+
7+ namespace rdf4cpp ::parser {
8+ xmlSAXHandler IStreamQuadIterator::ImplXML::make_sax_handler () {
9+ xmlSAXHandler r{};
10+ std::memset (&r, 0 , sizeof (xmlSAXHandler));
11+ r.initialized = XML_SAX2_MAGIC;
12+ r.getParameterEntity = get_entity;
13+ r.getEntity = get_entity;
14+ r.characters = on_characters;
15+ r.startElementNs = on_start_element;
16+ r.endElementNs = on_end_element;
17+ r.warning = on_error;
18+ r.error = on_error;
19+ return r;
20+ }
21+
22+ void IStreamQuadIterator::ImplXML::handle_state_transition (StateTransition transition) {
23+ dice::template_library::match (std::move (transition.modify_state ),
24+ [](NoStateChange) {
25+ // noop
26+ },
27+ [this ](PopState) {
28+ state_stack_.pop_back ();
29+ },
30+ [this ]<typename S>(S &&new_state) {
31+ state_stack_.emplace_back (std::in_place_type<S>, std::forward<S>(new_state));
32+ }
33+ );
34+ }
35+
36+ // implemented here, to have access to states
37+ bool iri_reserved (std::string_view const uri, std::string_view const local_name) {
38+ static constexpr std::array reserved = {
39+ xml_states::RDFState::start_element,
40+ xml_states::DescriptionState::id_attrib,
41+ xml_states::DescriptionState::about_attrib,
42+ xml_states::PredicateState::parse_type_attrib,
43+ xml_states::PredicateState::resource_attrib,
44+ xml_states::DescriptionState::node_id_attrib,
45+ xml_states::TypedLiteralPredicateState::datatype_attrib,
46+ xml_states::BaseState::base_attribute,
47+ xml_states::BaseState::lang_attribute,
48+ std::string_view (" http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach" ),
49+ std::string_view (" http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix" ),
50+ std::string_view (" http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID" ),
51+ };
52+ return std::ranges::any_of (reserved, [&](std::string_view const e) {
53+ return iri_equal_pieces (e, uri, local_name);
54+ });
55+ }
56+ bool iri_core_syntax (std::string_view const uri, std::string_view const local_name) {
57+ static constexpr std::array reserved = {
58+ xml_states::RDFState::start_element,
59+ xml_states::DescriptionState::id_attrib,
60+ xml_states::DescriptionState::about_attrib,
61+ xml_states::PredicateState::parse_type_attrib,
62+ xml_states::PredicateState::resource_attrib,
63+ xml_states::DescriptionState::node_id_attrib,
64+ };
65+ return std::ranges::any_of (reserved, [&](std::string_view const e) {
66+ return iri_equal_pieces (e, uri, local_name);
67+ });
68+ }
69+ bool iri_old_term (std::string_view const uri, std::string_view const local_name) {
70+ static constexpr std::array reserved = {
71+ std::string_view{" http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach" },
72+ std::string_view{" http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix" },
73+ std::string_view{" http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID" },
74+ };
75+ return std::ranges::any_of (reserved, [&](std::string_view const e) {
76+ return iri_equal_pieces (e, uri, local_name);
77+ });
78+ }
79+
80+ void IStreamQuadIterator::ImplXML::on_error (void *th, char const *msg, ...) { // NOLINT(*-dcl50-cpp)
81+ va_list args;
82+ va_list args_copy;
83+ auto t = static_cast <ImplXML *>(th);
84+ va_start (args, msg); // NOLINT(*-pro-bounds-array-to-pointer-decay)
85+ va_copy (args_copy, args); // NOLINT(*-pro-bounds-array-to-pointer-decay)
86+ std::string out{};
87+ out.resize (1 +vsnprintf (nullptr , 0 , msg, args_copy), ' \0 ' ); // NOLINT(*-pro-bounds-array-to-pointer-decay)
88+ auto l = vsnprintf (out.data (), out.size (), msg, args); // NOLINT(*-pro-bounds-array-to-pointer-decay)
89+ if (l > 0 ) {
90+ out.resize (l);
91+ } else {
92+ out = " unknown error, too long to fit" ;
93+ }
94+ t->output_ .add_error (ParsingError::Type::BadSyntax, std::move (out), t->make_info ());
95+ va_end (args); // NOLINT(*-pro-bounds-array-to-pointer-decay)
96+ va_end (args_copy); // NOLINT(*-pro-bounds-array-to-pointer-decay)
97+ }
98+ xmlEntity *IStreamQuadIterator::ImplXML::get_entity (void *, xmlChar const *e) {
99+ return xmlGetPredefinedEntity (e);
100+ }
101+ void IStreamQuadIterator::ImplXML::on_characters (void *th, xmlChar const *e, int const len) {
102+ auto *t = static_cast <ImplXML *>(th);
103+ t->handle_state_transition (t->current_state ().on_characters (t->output_ , from_xml_char (e, len), t->make_info ()));
104+ }
105+ void IStreamQuadIterator::ImplXML::on_start_element (void *th, xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, xmlChar const *uri,
106+ [[maybe_unused]] int n_namespaces, [[maybe_unused]] xmlChar const **namespaces,
107+ int const n_attributes, [[maybe_unused]] int n_defaulted, xmlChar const **attributes) {
108+ auto *t = static_cast <ImplXML *>(th);
109+ t->handle_state_transition (t->current_state ().on_start_element (t->output_ , from_xml_char (local_name), from_xml_char (uri),
110+ std::span{reinterpret_cast <XMLAttribute *>(attributes), static_cast <size_t >(n_attributes)}, t->make_info ()));
111+ }
112+ void IStreamQuadIterator::ImplXML::on_end_element (void *th, [[maybe_unused]] xmlChar const *local_name, [[maybe_unused]] xmlChar const *prefix, [[maybe_unused]] xmlChar const *uri) {
113+ auto *t = static_cast <ImplXML *>(th);
114+ t->handle_state_transition (t->current_state ().on_end_element (t->output_ , t->make_info ()));
115+ }
116+
117+ XMLStateInfo IStreamQuadIterator::ImplXML::make_info () const {
118+ std::string_view base = " " ;
119+ for (auto const &s : state_stack_ | std::views::reverse) {
120+ std::string_view const v = s->base ;
121+ if (!v.empty ()) {
122+ base = v;
123+ break ;
124+ }
125+ }
126+
127+ std::string_view lang_tag = " " ;
128+ for (auto const &s : state_stack_ | std::views::reverse) {
129+ std::string_view const v = s->lang_tag ;
130+ if (!v.empty ()) {
131+ lang_tag = v;
132+ break ;
133+ }
134+ }
135+
136+ xmlChar const *data;
137+ int size = 1024 ;
138+ int off = 0 ;
139+ xmlCtxtGetInputWindow (context_.get (), 0 , &data, &size, &off);
140+ std::string_view const source{reinterpret_cast <char const *>(data), static_cast <size_t >(size)};
141+
142+ return XMLStateInfo{
143+ current_line (),
144+ current_column (),
145+ base,
146+ lang_tag,
147+ source,
148+ off,
149+ };
150+ }
151+
152+ IStreamQuadIterator::ImplXML::ImplXML (void *obj, ReadFunc const read, ErrorFunc const err, EOFFunc const eof, state_type *state)
153+ : handler_(make_sax_handler()),
154+ context_ (xmlCreatePushParserCtxt(&handler_, this , nullptr , 0 , " rdf/xml" )),
155+ reader_obj_(obj), read_func_(read), error_func_(err), eof_func_(eof),
156+ output_(state) {
157+ xmlCtxtSetOptions (context_.get (), XML_PARSE_NOENT | XML_PARSE_PEDANTIC | XML_PARSE_NOCDATA | XML_PARSE_NO_XXE | XML_PARSE_BIG_LINES);
158+ state_stack_.reserve (10 );
159+ state_stack_.emplace_back (std::in_place_type<xml_states::InitialState>);
160+
161+ current_state ().base = output_.current_base_iri ();
162+ }
163+
164+ std::optional<IStreamQuadIterator::value_type> IStreamQuadIterator::ImplXML::next () {
165+ std::array<char , 8192 > buffer; // NOLINT(*-pro-type-member-init)
166+ while (output_.empty () && error_func_ (reader_obj_) == 0 && eof_func_ (reader_obj_) == 0 ) {
167+ auto const read = read_func_ (buffer.data (), sizeof (char ), buffer.size (), reader_obj_);
168+ xmlParseChunk (context_.get (), buffer.data (), static_cast <int >(read), eof_func_ (reader_obj_) != 0 );
169+ }
170+ return output_.next ();
171+ }
172+
173+ uint64_t IStreamQuadIterator::ImplXML::current_line () const noexcept {
174+ return xmlSAX2GetLineNumber (context_.get ());
175+ }
176+
177+ uint64_t IStreamQuadIterator::ImplXML::current_column () const noexcept {
178+ return xmlSAX2GetColumnNumber (context_.get ());
179+ }
180+ } // namespace rdf4cpp::parser
0 commit comments