From 5143031af3c8def7482b2f516983826f3b4f701a Mon Sep 17 00:00:00 2001 From: bigerl Date: Sun, 1 Mar 2026 19:18:34 +0100 Subject: [PATCH 1/7] initial implementation --- CMakeLists.txt | 3 +- .../parser/IStreamQuadIteratorSerdImpl.hpp | 5 +- src/rdf4cpp/parser/FormatGuess.cpp | 498 ++++++++++++++++++ src/rdf4cpp/parser/FormatGuess.hpp | 64 +++ src/rdf4cpp/parser/IStreamQuadIterator.cpp | 67 ++- src/rdf4cpp/parser/IStreamQuadIterator.hpp | 11 + src/rdf4cpp/parser/ParsingFlags.hpp | 30 +- src/rdf4cpp/parser/RDFFileParser.cpp | 49 +- src/rdf4cpp/parser/RDFFileParser.hpp | 2 + tests/CMakeLists.txt | 15 + tests/parser/tests_FormatGuess.cpp | 457 ++++++++++++++++ tests/parser/tests_FormatGuess_realworld.cpp | 322 +++++++++++ tests/parser/tests_IStreamQuadIterator.cpp | 2 +- 13 files changed, 1506 insertions(+), 19 deletions(-) create mode 100644 src/rdf4cpp/parser/FormatGuess.cpp create mode 100644 src/rdf4cpp/parser/FormatGuess.hpp create mode 100644 tests/parser/tests_FormatGuess.cpp create mode 100644 tests/parser/tests_FormatGuess_realworld.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f8cfb4a9..988aeddb1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.22) -project(rdf4cpp VERSION 0.1.13) +project(rdf4cpp VERSION 0.2.0) set(POBR_VERSION 3) # Persisted Object Binary Representation include(cmake/boilerplate_init.cmake) @@ -129,6 +129,7 @@ add_library(rdf4cpp src/rdf4cpp/datatypes/xsd/time/DayTimeDuration.cpp src/rdf4cpp/datatypes/xsd/time/YearMonthDuration.cpp src/rdf4cpp/namespaces/RDF.cpp + src/rdf4cpp/parser/FormatGuess.cpp src/rdf4cpp/parser/IStreamQuadIterator.cpp src/rdf4cpp/parser/RDFFileParser.cpp src/rdf4cpp/query/QuadPattern.cpp diff --git a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp index 21b4bd7ab..6543deda4 100644 --- a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp +++ b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp @@ -57,6 +57,8 @@ struct IStreamQuadIterator::ImplSerd final : Impl { static constexpr SerdSyntax extract_syntax_from_flags(ParsingFlags flags) noexcept { switch (flags.get_syntax()) { + case ParsingFlag::Turtle: + return SerdSyntax::SERD_TURTLE; case ParsingFlag::TriG: return SerdSyntax::SERD_TRIG; case ParsingFlag::NTriples: @@ -64,7 +66,8 @@ struct IStreamQuadIterator::ImplSerd final : Impl { case ParsingFlag::NQuads: return SerdSyntax::SERD_NQUADS; default: - return SerdSyntax::SERD_TURTLE; + // Auto, RdfXml, OwlXml, JsonLd should be resolved before reaching here + RDF4CPP_UNREACHABLE; } } diff --git a/src/rdf4cpp/parser/FormatGuess.cpp b/src/rdf4cpp/parser/FormatGuess.cpp new file mode 100644 index 000000000..c907d69e8 --- /dev/null +++ b/src/rdf4cpp/parser/FormatGuess.cpp @@ -0,0 +1,498 @@ +#include "FormatGuess.hpp" + +#include +#include +#include + +namespace rdf4cpp::parser { + + // --- helpers --- + + static std::string to_lower(std::string_view sv) { + std::string s{sv}; + std::ranges::transform(s, s.begin(), [](unsigned char c) { + return std::tolower(c); + }); + return s; + } + + static std::string_view skip_whitespace_and_bom(std::string_view sv) { + // skip UTF-8 BOM + if (sv.size() >= 3 && sv[0] == '\xEF' && sv[1] == '\xBB' && sv[2] == '\xBF') { + sv.remove_prefix(3); + } + // skip leading whitespace + while (!sv.empty() && (sv.front() == ' ' || sv.front() == '\t' || sv.front() == '\n' || sv.front() == '\r')) { + sv.remove_prefix(1); + } + return sv; + } + + static bool starts_with_icase(std::string_view haystack, std::string_view needle) { + if (haystack.size() < needle.size()) { + return false; + } + for (size_t i = 0; i < needle.size(); ++i) { + if (std::tolower(static_cast(haystack[i])) != std::tolower(static_cast(needle[i]))) { + return false; + } + } + return true; + } + + static bool contains(std::string_view haystack, std::string_view needle) { + return haystack.find(needle) != std::string_view::npos; + } + + static bool contains_icase(std::string_view haystack, std::string_view needle) { + if (needle.size() > haystack.size()) { + return false; + } + auto lower_hay = to_lower(haystack); + auto lower_needle = to_lower(needle); + return lower_hay.find(lower_needle) != std::string::npos; + } + + // --- extension mapping --- + + FormatGuess guess_format_from_extension(std::string_view extension) noexcept { + auto ext = to_lower(extension); + + if (ext == ".ttl" || ext == ".turtle") { + return {ParsingFlag::Turtle, GuessConfidence::High}; + } + if (ext == ".nt" || ext == ".ntriples") { + return {ParsingFlag::NTriples, GuessConfidence::High}; + } + if (ext == ".nq" || ext == ".nquads") { + return {ParsingFlag::NQuads, GuessConfidence::High}; + } + if (ext == ".trig") { + return {ParsingFlag::TriG, GuessConfidence::High}; + } + if (ext == ".rdf") { + return {ParsingFlag::RdfXml, GuessConfidence::High}; + } + if (ext == ".owx") { + return {ParsingFlag::OwlXml, GuessConfidence::High}; + } + if (ext == ".jsonld") { + return {ParsingFlag::JsonLd, GuessConfidence::High}; + } + + // ambiguous extensions — need content sniffing + if (ext == ".owl" || ext == ".xml") { + return {ParsingFlag::RdfXml, GuessConfidence::Low}; + } + + return {ParsingFlag::Auto, GuessConfidence::None}; + } + + FormatGuess guess_format_from_path(std::string_view file_path) noexcept { + // find last path separator + auto const last_sep = file_path.find_last_of("/\\"); + auto const filename = (last_sep != std::string_view::npos) ? file_path.substr(last_sep + 1) : file_path; + + // find last dot in filename + auto const dot_pos = filename.rfind('.'); + if (dot_pos == std::string_view::npos) { + return {ParsingFlag::Auto, GuessConfidence::None}; + } + + return guess_format_from_extension(filename.substr(dot_pos)); + } + + // --- content sniffing --- + + static bool has_trig_markers(std::string_view content) { + // Look for GRAPH keyword or { } blocks outside of string literals + // Simple heuristic: look for GRAPH keyword or standalone { not inside quotes + if (contains_icase(content, "GRAPH")) { + return true; + } + + // Look for pattern like IRI/prefixed-name followed by { + // or just { at start of a line (default graph block) + bool in_string = false; + char string_delim = 0; + for (size_t i = 0; i < content.size(); ++i) { + char c = content[i]; + if (in_string) { + if (c == '\\' && i + 1 < content.size()) { + ++i; // skip escaped char + continue; + } + if (c == string_delim) { + in_string = false; + } + continue; + } + if (c == '"' || c == '\'') { + in_string = true; + string_delim = c; + continue; + } + if (c == '{') { + return true; + } + } + return false; + } + + static FormatGuess sniff_xml_content(std::string_view content) { + // Check for OWL/XML first — more specific markers. + // OWL/XML uses root element (not ) and may still + // declare xmlns:rdf as a namespace prefix, so checking OWL/XML before + // RDF/XML avoids false positives. + bool has_ontology_root = contains(content, " . (3 terms = NT, 4 terms = NQ) + bool found_4_terms = false; + bool found_any_triple = false; + + size_t pos = 0; + while (pos < content.size()) { + // find end of line + auto eol = content.find('\n', pos); + auto line = content.substr(pos, eol == std::string_view::npos ? std::string_view::npos : eol - pos); + pos = (eol == std::string_view::npos) ? content.size() : eol + 1; + + // trim + while (!line.empty() && (line.front() == ' ' || line.front() == '\t')) { + line.remove_prefix(1); + } + while (!line.empty() && (line.back() == ' ' || line.back() == '\t' || line.back() == '\r')) { + line.remove_suffix(1); + } + + // skip empty lines and comments + if (line.empty() || line.front() == '#') { + continue; + } + + // Count terms by walking through the line, handling # comments + // that appear outside of IRIs and literals correctly. + int term_count = 0; + bool found_dot = false; + size_t i = 0; + while (i < line.size()) { + // skip whitespace + while (i < line.size() && (line[i] == ' ' || line[i] == '\t')) { + ++i; + } + if (i >= line.size()) { + break; + } + char c = line[i]; + + if (c == '.') { + found_dot = true; + ++i; + // skip trailing whitespace and optional comment after . + while (i < line.size() && (line[i] == ' ' || line[i] == '\t')) { + ++i; + } + if (i < line.size() && line[i] == '#') { + // inline comment after dot — valid + } + break; + } else if (c == '#') { + // comment at top level (outside IRI/literal) — not valid N-Triples + // unless we already found the dot + return {ParsingFlag::Auto, GuessConfidence::None}; + } else if (c == '<') { + // IRI — find closing > + auto close = line.find('>', i); + if (close == std::string_view::npos) { + break; + } + i = close + 1; + ++term_count; + } else if (c == '_' && i + 1 < line.size() && line[i + 1] == ':') { + // blank node — skip to next whitespace + while (i < line.size() && line[i] != ' ' && line[i] != '\t') { + ++i; + } + ++term_count; + } else if (c == '"') { + // literal — find unescaped closing quote, then skip datatype/lang + ++i; + while (i < line.size()) { + if (line[i] == '\\') { + i += 2; + continue; + } + if (line[i] == '"') { + break; + } + ++i; + } + if (i < line.size()) { + ++i; // skip closing quote + } + // skip ^^ or @lang (which may contain # inside <...>) + if (i + 1 < line.size() && line[i] == '^' && line[i + 1] == '^') { + i += 2; + if (i < line.size() && line[i] == '<') { + auto close = line.find('>', i); + if (close != std::string_view::npos) { + i = close + 1; + } + } else { + while (i < line.size() && line[i] != ' ' && line[i] != '\t' && line[i] != '.') { + ++i; + } + } + } else if (i < line.size() && line[i] == '@') { + while (i < line.size() && line[i] != ' ' && line[i] != '\t' && line[i] != '.') { + ++i; + } + } + ++term_count; + } else { + // unexpected character for N-Triples/N-Quads + return {ParsingFlag::Auto, GuessConfidence::None}; + } + } + + if (!found_dot) { + return {ParsingFlag::Auto, GuessConfidence::None}; + } + if (term_count == 4) { + found_4_terms = true; + } + if (term_count >= 3) { + found_any_triple = true; + } + if (term_count < 3 || term_count > 4) { + return {ParsingFlag::Auto, GuessConfidence::None}; + } + } + + if (!found_any_triple) { + return {ParsingFlag::Auto, GuessConfidence::None}; + } + if (found_4_terms) { + return {ParsingFlag::NQuads, GuessConfidence::Medium}; + } + return {ParsingFlag::NTriples, GuessConfidence::Medium}; + } + + static std::string_view skip_comments(std::string_view sv) { + // skip lines starting with # (comments in N-Triples/Turtle/TriG) + while (!sv.empty() && sv.front() == '#') { + auto eol = sv.find('\n'); + if (eol == std::string_view::npos) { + return {}; + } + sv.remove_prefix(eol + 1); + // skip whitespace after comment line + while (!sv.empty() && (sv.front() == ' ' || sv.front() == '\t' || sv.front() == '\n' || sv.front() == '\r')) { + sv.remove_prefix(1); + } + } + return sv; + } + + FormatGuess guess_format_from_content(std::string_view prefix) noexcept { + auto full_content = skip_whitespace_and_bom(prefix); + if (full_content.empty()) { + return {ParsingFlag::Auto, GuessConfidence::None}; + } + + // Skip leading comment lines for the first-byte checks + auto content = skip_comments(full_content); + if (content.empty()) { + return {ParsingFlag::Auto, GuessConfidence::None}; + } + + // Phase 1: deterministic checks + + // XML-based formats + if (content.starts_with(" 1 && content[1] == ':')) { + auto result = sniff_ntriples_or_nquads(full_content); + if (result.is_known()) { + return result; + } + // If N-Triples detection failed, the content starts with `<` or `_:` + // which are valid Turtle starts too — fall through to Phase 3 + } + + // Phase 3: check for Turtle/TriG syntax markers in content that didn't + // match any earlier patterns (e.g. Turtle without @prefix directives) + { + bool has_turtle_marker = false; + bool in_iri = false; + bool in_string = false; + char string_delim = 0; + + for (size_t i = 0; i < content.size(); ++i) { + char c = content[i]; + if (in_string) { + if (c == '\\' && i + 1 < content.size()) { + ++i; + continue; + } + if (c == string_delim) { + in_string = false; + } + continue; + } + if (in_iri) { + if (c == '>') { + in_iri = false; + } + continue; + } + if (c == '<') { + in_iri = true; + continue; + } + if (c == '"' || c == '\'') { + in_string = true; + string_delim = c; + continue; + } + // Turtle/TriG syntax markers not valid in N-Triples + if (c == ';' || c == ',' || c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}') { + has_turtle_marker = true; + break; + } + // bare `a` surrounded by whitespace = Turtle shorthand for rdf:type + if (c == 'a' && i > 0 && (content[i - 1] == ' ' || content[i - 1] == '\t') && i + 1 < content.size() + && (content[i + 1] == ' ' || content[i + 1] == '\t')) + { + has_turtle_marker = true; + break; + } + } + + if (has_turtle_marker) { + if (has_trig_markers(full_content)) { + return {ParsingFlag::TriG, GuessConfidence::Low}; + } + return {ParsingFlag::Turtle, GuessConfidence::Low}; + } + } + + return {ParsingFlag::Auto, GuessConfidence::None}; + } + + FormatGuess guess_format(std::string_view file_path, std::string_view prefix) noexcept { + auto ext_guess = guess_format_from_path(file_path); + auto content_guess = guess_format_from_content(prefix); + + // If extension gives a strong match and no content sniffing needed + if (ext_guess.confidence == GuessConfidence::High) { + // Check if content agrees for Certain confidence + if (content_guess.is_known() && content_guess.syntax == ext_guess.syntax) { + return {ext_guess.syntax, GuessConfidence::Certain}; + } + // Extension is high confidence — trust it even if content is ambiguous + return ext_guess; + } + + // Low confidence extension (e.g. .owl, .xml) — need content disambiguation + if (ext_guess.confidence == GuessConfidence::Low) { + if (content_guess.is_known()) { + // Content overrides ambiguous extension + return content_guess; + } + // Content inconclusive, use extension guess + return ext_guess; + } + + // No extension match — rely on content + if (content_guess.is_known()) { + return content_guess; + } + + return {ParsingFlag::Auto, GuessConfidence::None}; + } + +} // namespace rdf4cpp::parser diff --git a/src/rdf4cpp/parser/FormatGuess.hpp b/src/rdf4cpp/parser/FormatGuess.hpp new file mode 100644 index 000000000..268734e7d --- /dev/null +++ b/src/rdf4cpp/parser/FormatGuess.hpp @@ -0,0 +1,64 @@ +#ifndef RDF4CPP_PARSER_FORMATGUESS_HPP +#define RDF4CPP_PARSER_FORMATGUESS_HPP + +#include +#include + +#include + +namespace rdf4cpp::parser { + + /** + * Confidence level for an RDF format guess, ordered from least to most confident. + */ + enum struct GuessConfidence : uint8_t { + None = 0, ///< no guess could be made + Low, ///< weak heuristic match (e.g. ambiguous extension like .owl) + Medium, ///< content sniffing with good signal + High, ///< file extension match or strong content match + Certain, ///< unambiguous (extension + content agree) + }; + + /** + * Result of an RDF serialization format guess, combining the detected syntax with + * a confidence level indicating how reliable the guess is. + */ + struct FormatGuess { + ParsingFlag syntax = ParsingFlag::Auto; + GuessConfidence confidence = GuessConfidence::None; + + [[nodiscard]] constexpr bool is_known() const noexcept { + return syntax != ParsingFlag::Auto && confidence != GuessConfidence::None; + } + + [[nodiscard]] constexpr explicit operator bool() const noexcept { + return is_known(); + } + }; + + /** + * Guess the RDF serialization format from a file extension (including the dot). + * Case-insensitive. Returns {Auto, None} for unrecognized extensions. + */ + [[nodiscard]] FormatGuess guess_format_from_extension(std::string_view extension) noexcept; + + /** + * Extract the file extension from a path and guess the format. + */ + [[nodiscard]] FormatGuess guess_format_from_path(std::string_view file_path) noexcept; + + /** + * Guess the RDF serialization format by inspecting a prefix of the file content. + * At least 512 bytes recommended, 4096 bytes ideal. + */ + [[nodiscard]] FormatGuess guess_format_from_content(std::string_view prefix) noexcept; + + /** + * Combined guess: extension first, content second. + * Confidence boosted to Certain when both agree. + */ + [[nodiscard]] FormatGuess guess_format(std::string_view file_path, std::string_view prefix) noexcept; + +} // namespace rdf4cpp::parser + +#endif // RDF4CPP_PARSER_FORMATGUESS_HPP diff --git a/src/rdf4cpp/parser/IStreamQuadIterator.cpp b/src/rdf4cpp/parser/IStreamQuadIterator.cpp index 1abc99c91..087e58f56 100644 --- a/src/rdf4cpp/parser/IStreamQuadIterator.cpp +++ b/src/rdf4cpp/parser/IStreamQuadIterator.cpp @@ -1,9 +1,12 @@ #include "IStreamQuadIterator.hpp" #include +#include #include #include +#include +#include #if __has_include() #include @@ -11,6 +14,8 @@ namespace rdf4cpp::parser { +static constexpr size_t peek_size = 4096; + /** * Adaptor function so that serd can read from std::istreams. * Matches the interface of SerdSource/fread @@ -51,16 +56,64 @@ static int istream_eof(void *voided_self) noexcept { return static_cast(self->eof()); } +static void throw_if_unsupported(ParsingFlag syntax) { + if (syntax == ParsingFlag::OwlXml) { + throw std::runtime_error("OWL/XML format is not supported. Please convert to RDF/XML or Turtle."); + } + if (syntax == ParsingFlag::JsonLd) { + throw std::runtime_error("JSON-LD format is not supported."); + } +} + +static ParsingFlag resolve_auto_syntax(FormatGuess guess) { + if (guess.is_known()) { + return guess.syntax; + } + // fallback to Turtle when we can't determine the format + return ParsingFlag::Turtle; +} + IStreamQuadIterator::IStreamQuadIterator(void *stream, ReadFunc read, ErrorFunc error, EOFFunc eof, flags_type flags, - state_type *state) - : impl{flags.get_syntax() == ParsingFlag::RdfXml ? - static_cast>(std::make_unique(stream, read, error, eof, state)) : - std::make_unique(stream, read, error, flags, state)}, - cur{impl->next()} { + state_type *state) { + auto make_impl = [](void *s, ReadFunc r, ErrorFunc e, EOFFunc ef, + flags_type f, state_type *st) -> std::unique_ptr { + if (f.get_syntax() == ParsingFlag::RdfXml) { + return std::make_unique(s, r, e, ef, st); + } + return std::make_unique(s, r, e, f, st); + }; + + if (flags.get_syntax() == ParsingFlag::Auto) { + // Peek bytes for content sniffing + std::vector buf(peek_size); + size_t const bytes_read = read(buf.data(), 1, peek_size, stream); + buf.resize(bytes_read); + + std::string_view const prefix{buf.data(), buf.size()}; + auto const guess = guess_format_from_content(prefix); + auto const resolved = resolve_auto_syntax(guess); + throw_if_unsupported(resolved); + + detected_format_ = guess; + auto const resolved_flags = flags.with_syntax(resolved); + + // Create a PrefixBufferedReader to replay peeked bytes + buffered_reader_ = std::make_unique(stream, read, error, eof, std::move(buf)); + impl = make_impl(buffered_reader_.get(), + &PrefixBufferedReader::read_func, + &PrefixBufferedReader::error_func, + &PrefixBufferedReader::eof_func, + resolved_flags, state); + } else { + throw_if_unsupported(flags.get_syntax()); + detected_format_ = FormatGuess{flags.get_syntax(), GuessConfidence::Certain}; + impl = make_impl(stream, read, error, eof, flags, state); + } + cur = impl->next(); } IStreamQuadIterator::IStreamQuadIterator(std::istream &istream, @@ -95,6 +148,10 @@ uint64_t IStreamQuadIterator::current_column() const noexcept { return impl->current_column(); } +FormatGuess IStreamQuadIterator::detected_format() const noexcept { + return detected_format_; +} + bool IStreamQuadIterator::operator==(std::default_sentinel_t) const noexcept { return !cur.has_value(); } diff --git a/src/rdf4cpp/parser/IStreamQuadIterator.hpp b/src/rdf4cpp/parser/IStreamQuadIterator.hpp index 47148b070..0ed2a0e3b 100644 --- a/src/rdf4cpp/parser/IStreamQuadIterator.hpp +++ b/src/rdf4cpp/parser/IStreamQuadIterator.hpp @@ -8,6 +8,7 @@ #include +#include #include #include #include @@ -15,6 +16,8 @@ namespace rdf4cpp::parser { +struct PrefixBufferedReader; + /** * Identical semantics to fread. * Uses stream to read at most count elements of size element_size into buffer. @@ -107,6 +110,9 @@ struct IStreamQuadIterator { std::unique_ptr impl; std::optional> cur; + FormatGuess detected_format_{}; + std::unique_ptr buffered_reader_; + public: /** * Constructs a IStreamQuadIterator from a C-like io api. That is something similar to @@ -155,6 +161,11 @@ struct IStreamQuadIterator { [[nodiscard]] uint64_t current_line() const noexcept; [[nodiscard]] uint64_t current_column() const noexcept; + /** + * @return the detected format when Auto mode was used, or the explicitly set format + */ + [[nodiscard]] FormatGuess detected_format() const noexcept; + bool operator==(std::default_sentinel_t) const noexcept; bool operator!=(std::default_sentinel_t) const noexcept; }; diff --git a/src/rdf4cpp/parser/ParsingFlags.hpp b/src/rdf4cpp/parser/ParsingFlags.hpp index 6da05a64e..02eb86a99 100644 --- a/src/rdf4cpp/parser/ParsingFlags.hpp +++ b/src/rdf4cpp/parser/ParsingFlags.hpp @@ -8,8 +8,8 @@ namespace rdf4cpp::parser { /** * Note that the syntax flags are mutually exclusive. - * If none is used, Turtle is the default. - * If more than one is used accidentally at the same time, TriG is likely the result (even if it does never get specified). + * If none is used, Auto is the default (auto-detect format from file extension and content). + * If more than one is used accidentally at the same time, the result is undefined. */ enum struct ParsingFlag : uint8_t { Lax = 1 << 0, @@ -17,11 +17,14 @@ enum struct ParsingFlag : uint8_t { KeepBlankNodeIds = 1 << 2, NoParseBlankNode = 1 << 3, - Turtle = 0b00 << 4, // default - NTriples = 0b01 << 4, - NQuads = 0b10 << 4, - TriG = 0b11 << 4, - RdfXml = 0b100 << 4, + Auto = 0b000 << 4, // default — auto-detect format + NTriples = 0b001 << 4, + NQuads = 0b010 << 4, + TriG = 0b011 << 4, + RdfXml = 0b100 << 4, + Turtle = 0b101 << 4, + OwlXml = 0b110 << 4, // detected but not supported + JsonLd = 0b111 << 4, // detected but not supported }; constexpr uint8_t ParsingFlag_SyntaxMask = 0b111 << 4; @@ -66,15 +69,24 @@ struct ParsingFlags { } /** - * @return the syntax ParsingFlag contained in this ParsingFlags. (Turtle if not specified) + * @return the syntax ParsingFlag contained in this ParsingFlags. (Auto if not specified) */ [[nodiscard]] constexpr ParsingFlag get_syntax() const noexcept { return static_cast(flags & static_cast(ParsingFlag_SyntaxMask)); } + /** + * @return a copy of this ParsingFlags with the syntax bits replaced by the given syntax + */ + [[nodiscard]] constexpr ParsingFlags with_syntax(ParsingFlag syntax) const noexcept { + auto new_flags = flags & ~static_cast(ParsingFlag_SyntaxMask); + new_flags |= static_cast(syntax); + return ParsingFlags{static_cast(new_flags)}; + } + [[nodiscard]] constexpr bool syntax_allows_prefixes() const noexcept { auto const syn = get_syntax(); - return syn == ParsingFlag::Turtle || syn == ParsingFlag::TriG; + return syn == ParsingFlag::Turtle || syn == ParsingFlag::TriG || syn == ParsingFlag::Auto; } }; diff --git a/src/rdf4cpp/parser/RDFFileParser.cpp b/src/rdf4cpp/parser/RDFFileParser.cpp index 055a2db0e..3f5d70963 100644 --- a/src/rdf4cpp/parser/RDFFileParser.cpp +++ b/src/rdf4cpp/parser/RDFFileParser.cpp @@ -1,23 +1,62 @@ #include "RDFFileParser.hpp" - +#include #include +#include +#include + namespace rdf4cpp::parser { + +static constexpr size_t peek_size = 4096; + RDFFileParser::RDFFileParser(const std::string &file_path, flags_type flags, state_type *state) : file_path_(file_path), flags_(flags), state_(state) { } RDFFileParser::RDFFileParser(std::string &&file_path, flags_type flags, state_type *state) : file_path_(std::move(file_path)), flags_(flags), state_(state) { } + RDFFileParser::iterator RDFFileParser::begin() const { FILE *stream = fopen_fastseq(file_path_.c_str(), "r"); if (stream == nullptr) { throw std::system_error{errno, std::system_category()}; } - return {std::move(stream), flags_, state_}; + auto flags = flags_; + + if (flags.get_syntax() == ParsingFlag::Auto) { + // Peek content for sniffing + std::vector buf(peek_size); + size_t bytes_read = fread(buf.data(), 1, peek_size, stream); + buf.resize(bytes_read); + + // Rewind the stream so IStreamQuadIterator reads from start + if (fseek(stream, 0, SEEK_SET) != 0) { + fclose(stream); + throw std::runtime_error("Failed to rewind file stream for format detection"); + } + + std::string_view const prefix{buf.data(), buf.size()}; + auto guess = guess_format(file_path_, prefix); + + auto resolved = guess.is_known() ? guess.syntax : ParsingFlag::Turtle; + + if (resolved == ParsingFlag::OwlXml) { + fclose(stream); + throw std::runtime_error("OWL/XML format is not supported. Please convert to RDF/XML or Turtle."); + } + if (resolved == ParsingFlag::JsonLd) { + fclose(stream); + throw std::runtime_error("JSON-LD format is not supported."); + } + + flags = flags.with_syntax(resolved); + } + + return {std::move(stream), flags, state_}; } + std::default_sentinel_t RDFFileParser::end() const noexcept { return {}; } @@ -48,6 +87,12 @@ RDFFileParser::iterator &RDFFileParser::iterator::operator++() { bool RDFFileParser::iterator::operator==(const RDFFileParser::iterator &other) const noexcept { return iter_ == other.iter_; } +FormatGuess RDFFileParser::iterator::detected_format() const noexcept { + if (iter_) { + return iter_->detected_format(); + } + return {}; +} bool operator==(const RDFFileParser::iterator &iter, std::default_sentinel_t s) noexcept { return (*iter.iter_) == s; } diff --git a/src/rdf4cpp/parser/RDFFileParser.hpp b/src/rdf4cpp/parser/RDFFileParser.hpp index a45ef47b9..da1128161 100644 --- a/src/rdf4cpp/parser/RDFFileParser.hpp +++ b/src/rdf4cpp/parser/RDFFileParser.hpp @@ -2,6 +2,7 @@ #define RDF4CPP_RDFFILEPARSER_HPP #include +#include #include namespace rdf4cpp::parser { @@ -70,6 +71,7 @@ struct RDFFileParser { pointer operator->() const noexcept; iterator &operator++(); [[nodiscard]] bool operator==(const iterator &other) const noexcept; + [[nodiscard]] FormatGuess detected_format() const noexcept; // != gets generated by compiler }; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2c38a3cd5..4a72658e6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -369,6 +369,21 @@ target_link_libraries(tests_XMLParser ) add_test(NAME tests_XMLParser COMMAND tests_XMLParser) +add_executable(tests_FormatGuess parser/tests_FormatGuess.cpp) +target_link_libraries(tests_FormatGuess + doctest::doctest + rdf4cpp +) +add_test(NAME tests_FormatGuess COMMAND tests_FormatGuess) + +add_executable(tests_FormatGuess_realworld parser/tests_FormatGuess_realworld.cpp) +target_link_libraries(tests_FormatGuess_realworld + doctest::doctest + rdf4cpp + CURL::libcurl +) +add_test(NAME tests_FormatGuess_realworld COMMAND tests_FormatGuess_realworld) + if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/test_swdf/swdf.nt") file(DOWNLOAD "https://files.tentris.dev/swdf.zip" "${CMAKE_CURRENT_BINARY_DIR}/test_swdf/swdf.zip") execute_process(COMMAND unzip "${CMAKE_CURRENT_BINARY_DIR}/test_swdf/swdf.zip" -d "${CMAKE_CURRENT_BINARY_DIR}/test_swdf") diff --git a/tests/parser/tests_FormatGuess.cpp b/tests/parser/tests_FormatGuess.cpp new file mode 100644 index 000000000..135f02594 --- /dev/null +++ b/tests/parser/tests_FormatGuess.cpp @@ -0,0 +1,457 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN + +#include + +#include +#include +#include + +#include + +using namespace rdf4cpp; +using namespace rdf4cpp::parser; + +TEST_SUITE("FormatGuess") { + + TEST_CASE("guess_format_from_extension") { + SUBCASE("known extensions") { + CHECK(guess_format_from_extension(".ttl").syntax == ParsingFlag::Turtle); + CHECK(guess_format_from_extension(".ttl").confidence == GuessConfidence::High); + CHECK(guess_format_from_extension(".turtle").syntax == ParsingFlag::Turtle); + + CHECK(guess_format_from_extension(".nt").syntax == ParsingFlag::NTriples); + CHECK(guess_format_from_extension(".nt").confidence == GuessConfidence::High); + CHECK(guess_format_from_extension(".ntriples").syntax == ParsingFlag::NTriples); + + CHECK(guess_format_from_extension(".nq").syntax == ParsingFlag::NQuads); + CHECK(guess_format_from_extension(".nq").confidence == GuessConfidence::High); + CHECK(guess_format_from_extension(".nquads").syntax == ParsingFlag::NQuads); + + CHECK(guess_format_from_extension(".trig").syntax == ParsingFlag::TriG); + CHECK(guess_format_from_extension(".trig").confidence == GuessConfidence::High); + + CHECK(guess_format_from_extension(".rdf").syntax == ParsingFlag::RdfXml); + CHECK(guess_format_from_extension(".rdf").confidence == GuessConfidence::High); + + CHECK(guess_format_from_extension(".owx").syntax == ParsingFlag::OwlXml); + CHECK(guess_format_from_extension(".owx").confidence == GuessConfidence::High); + + CHECK(guess_format_from_extension(".jsonld").syntax == ParsingFlag::JsonLd); + CHECK(guess_format_from_extension(".jsonld").confidence == GuessConfidence::High); + } + + SUBCASE("ambiguous extensions") { + CHECK(guess_format_from_extension(".owl").confidence == GuessConfidence::Low); + CHECK(guess_format_from_extension(".xml").confidence == GuessConfidence::Low); + } + + SUBCASE("unknown extensions") { + CHECK_FALSE(guess_format_from_extension(".gz").is_known()); + CHECK_FALSE(guess_format_from_extension(".csv").is_known()); + CHECK_FALSE(guess_format_from_extension(".txt").is_known()); + CHECK_FALSE(guess_format_from_extension("").is_known()); + } + + SUBCASE("case insensitive") { + CHECK(guess_format_from_extension(".TTL").syntax == ParsingFlag::Turtle); + CHECK(guess_format_from_extension(".Nt").syntax == ParsingFlag::NTriples); + CHECK(guess_format_from_extension(".NQ").syntax == ParsingFlag::NQuads); + CHECK(guess_format_from_extension(".TRIG").syntax == ParsingFlag::TriG); + CHECK(guess_format_from_extension(".RDF").syntax == ParsingFlag::RdfXml); + CHECK(guess_format_from_extension(".JSONLD").syntax == ParsingFlag::JsonLd); + } + } + + TEST_CASE("guess_format_from_path") { + CHECK(guess_format_from_path("/path/to/file.ttl").syntax == ParsingFlag::Turtle); + CHECK(guess_format_from_path("/some/dir/data.nt").syntax == ParsingFlag::NTriples); + CHECK(guess_format_from_path("file.nq").syntax == ParsingFlag::NQuads); + CHECK(guess_format_from_path("/a/b/c.trig").syntax == ParsingFlag::TriG); + CHECK(guess_format_from_path("ontology.rdf").syntax == ParsingFlag::RdfXml); + CHECK(guess_format_from_path("data.jsonld").syntax == ParsingFlag::JsonLd); + + SUBCASE("no extension") { + CHECK_FALSE(guess_format_from_path("/path/to/file").is_known()); + CHECK_FALSE(guess_format_from_path("").is_known()); + } + + SUBCASE("path separators") { + CHECK(guess_format_from_path("C:\\Users\\data.ttl").syntax == ParsingFlag::Turtle); + CHECK(guess_format_from_path("/home/user/data.ttl").syntax == ParsingFlag::Turtle); + } + } + + TEST_CASE("guess_format_from_content") { + + SUBCASE("N-Triples") { + constexpr char const *nt_content = + " \"object\" .\n" + " .\n"; + auto guess = guess_format_from_content(nt_content); + CHECK(guess.syntax == ParsingFlag::NTriples); + CHECK(guess.is_known()); + } + + SUBCASE("N-Quads") { + constexpr char const *nq_content = + " .\n" + " .\n"; + auto guess = guess_format_from_content(nq_content); + CHECK(guess.syntax == ParsingFlag::NQuads); + CHECK(guess.is_known()); + } + + SUBCASE("Turtle with @prefix") { + constexpr char const *ttl_content = + "@prefix ex: .\n" + "ex:s ex:p ex:o .\n"; + auto guess = guess_format_from_content(ttl_content); + CHECK(guess.syntax == ParsingFlag::Turtle); + CHECK(guess.is_known()); + } + + SUBCASE("Turtle with @base") { + constexpr char const *ttl_content = + "@base .\n" + "

.\n"; + auto guess = guess_format_from_content(ttl_content); + CHECK(guess.syntax == ParsingFlag::Turtle); + CHECK(guess.is_known()); + } + + SUBCASE("Turtle with SPARQL-style PREFIX") { + constexpr char const *ttl_content = + "PREFIX ex: \n" + "ex:s ex:p ex:o .\n"; + auto guess = guess_format_from_content(ttl_content); + CHECK(guess.syntax == ParsingFlag::Turtle); + CHECK(guess.is_known()); + } + + SUBCASE("TriG with GRAPH keyword") { + constexpr char const *trig_content = + "@prefix ex: .\n" + "GRAPH ex:g { ex:s ex:p ex:o . }\n"; + auto guess = guess_format_from_content(trig_content); + CHECK(guess.syntax == ParsingFlag::TriG); + CHECK(guess.is_known()); + } + + SUBCASE("TriG with curly braces") { + constexpr char const *trig_content = + "@prefix ex: .\n" + "ex:g { ex:s ex:p ex:o . }\n"; + auto guess = guess_format_from_content(trig_content); + CHECK(guess.syntax == ParsingFlag::TriG); + CHECK(guess.is_known()); + } + + SUBCASE("RDF/XML with xml declaration") { + constexpr char const *rdfxml_content = + "\n" + "\n" + ""; + auto guess = guess_format_from_content(rdfxml_content); + CHECK(guess.syntax == ParsingFlag::RdfXml); + CHECK(guess.is_known()); + } + + SUBCASE("RDF/XML without xml declaration") { + constexpr char const *rdfxml_content = + "\n" + " \n" + " \n" + ""; + auto guess = guess_format_from_content(rdfxml_content); + CHECK(guess.syntax == ParsingFlag::RdfXml); + CHECK(guess.is_known()); + } + + SUBCASE("OWL/XML detection") { + constexpr char const *owlxml_content = + "\n" + "\n" + " \n" + ""; + auto guess = guess_format_from_content(owlxml_content); + CHECK(guess.syntax == ParsingFlag::OwlXml); + CHECK(guess.is_known()); + } + + SUBCASE("JSON-LD detection") { + constexpr char const *jsonld_content = + "{\n" + " \"@context\": \"http://schema.org/\",\n" + " \"@type\": \"Person\",\n" + " \"name\": \"Jane Doe\"\n" + "}"; + auto guess = guess_format_from_content(jsonld_content); + CHECK(guess.syntax == ParsingFlag::JsonLd); + CHECK(guess.is_known()); + } + + SUBCASE("empty content") { + CHECK_FALSE(guess_format_from_content("").is_known()); + CHECK_FALSE(guess_format_from_content(" ").is_known()); + } + + SUBCASE("BOM handling") { + std::string bom_ttl = "\xEF\xBB\xBF@prefix ex: .\nex:s ex:p ex:o .\n"; + auto guess = guess_format_from_content(bom_ttl); + CHECK(guess.syntax == ParsingFlag::Turtle); + } + + SUBCASE("comment-only N-Triples") { + constexpr char const *nt_content = + "# This is a comment\n" + " .\n"; + auto guess = guess_format_from_content(nt_content); + CHECK(guess.syntax == ParsingFlag::NTriples); + } + + SUBCASE("blank node subject in N-Triples") { + constexpr char const *nt_content = + "_:b1 .\n"; + auto guess = guess_format_from_content(nt_content); + CHECK(guess.syntax == ParsingFlag::NTriples); + } + } + + TEST_CASE("guess_format combined") { + SUBCASE("extension and content agree") { + constexpr char const *ttl_content = "@prefix ex: .\nex:s ex:p ex:o .\n"; + auto guess = guess_format("/path/to/file.ttl", ttl_content); + CHECK(guess.syntax == ParsingFlag::Turtle); + CHECK(guess.confidence == GuessConfidence::Certain); + } + + SUBCASE("extension high, content inconclusive") { + auto guess = guess_format("/path/to/file.ttl", ""); + CHECK(guess.syntax == ParsingFlag::Turtle); + CHECK(guess.confidence == GuessConfidence::High); + } + + SUBCASE("ambiguous extension, content disambiguates to RDF/XML") { + constexpr char const *rdfxml_content = + "\n" + "\n" + ""; + auto guess = guess_format("/path/to/ontology.owl", rdfxml_content); + CHECK(guess.syntax == ParsingFlag::RdfXml); + CHECK(guess.confidence == GuessConfidence::High); + } + + SUBCASE("ambiguous extension, content disambiguates to OWL/XML") { + constexpr char const *owlxml_content = + "\n" + "\n" + ""; + auto guess = guess_format("/path/to/ontology.owl", owlxml_content); + CHECK(guess.syntax == ParsingFlag::OwlXml); + } + + SUBCASE("no extension, content provides guess") { + constexpr char const *nt_content = + " .\n"; + auto guess = guess_format("/path/to/data", nt_content); + CHECK(guess.syntax == ParsingFlag::NTriples); + } + } + + TEST_CASE("unsupported format errors") { + SUBCASE("OWL/XML throws on IStreamQuadIterator") { + constexpr char const *owlxml_content = + "\n" + "\n" + " \n" + ""; + std::istringstream iss{owlxml_content}; + CHECK_THROWS_AS(IStreamQuadIterator{iss + }, std::runtime_error); + } + + SUBCASE("JSON-LD throws on IStreamQuadIterator") { + constexpr char const *jsonld_content = + "{\"@context\": \"http://schema.org/\", \"name\": \"Jane\"}"; + std::istringstream iss{jsonld_content}; + CHECK_THROWS_AS(IStreamQuadIterator{iss}, std::runtime_error); + } + + SUBCASE("explicit OwlXml flag throws") { + std::istringstream iss{"whatever"}; + CHECK_THROWS_AS((IStreamQuadIterator{iss, ParsingFlag::OwlXml}), std::runtime_error); + } + + SUBCASE("explicit JsonLd flag throws") { + std::istringstream iss{"whatever"}; + CHECK_THROWS_AS((IStreamQuadIterator{iss, ParsingFlag::JsonLd}), std::runtime_error); + } + } + + TEST_CASE("Auto mode end-to-end with IStreamQuadIterator") { + + SUBCASE("N-Triples auto-detected") { + constexpr char const *nt_content = + " \"hello\" .\n" + " .\n"; + std::istringstream iss{nt_content}; + IStreamQuadIterator qit{iss}; // Auto mode (default) + + CHECK(qit.detected_format().syntax == ParsingFlag::NTriples); + size_t n = 0; + for (; qit != std::default_sentinel; ++qit) { + CHECK(qit->has_value()); + ++n; + } + CHECK_EQ(n, 2); + } + + SUBCASE("Turtle auto-detected via @prefix") { + constexpr char const *ttl_content = + "@prefix ex: .\n" + "ex:s ex:p \"test\" .\n"; + std::istringstream iss{ttl_content}; + IStreamQuadIterator qit{iss}; + + CHECK(qit.detected_format().syntax == ParsingFlag::Turtle); + size_t n = 0; + for (; qit != std::default_sentinel; ++qit) { + CHECK(qit->has_value()); + ++n; + } + CHECK_EQ(n, 1); + } + + SUBCASE("N-Quads auto-detected") { + constexpr char const *nq_content = + " .\n"; + std::istringstream iss{nq_content}; + IStreamQuadIterator qit{iss}; + + CHECK(qit.detected_format().syntax == ParsingFlag::NQuads); + CHECK(qit != std::default_sentinel); + CHECK(qit->has_value()); + CHECK(qit->value().graph() == IRI{"http://example/g"}); + } + + SUBCASE("RDF/XML auto-detected") { + constexpr char const *rdfxml_content = + "\n" + "\n" + " \n" + " hello\n" + " \n" + ""; + std::istringstream iss{rdfxml_content}; + IStreamQuadIterator qit{iss}; + + CHECK(qit.detected_format().syntax == ParsingFlag::RdfXml); + CHECK(qit != std::default_sentinel); + CHECK(qit->has_value()); + } + } + + TEST_CASE("Explicit flags backward compatibility") { + + SUBCASE("explicit Turtle") { + constexpr char const *ttl_content = + "@prefix ex: .\n" + "ex:s ex:p \"test\" .\n"; + std::istringstream iss{ttl_content}; + IStreamQuadIterator qit{iss, ParsingFlag::Turtle}; + + CHECK(qit.detected_format().syntax == ParsingFlag::Turtle); + CHECK(qit != std::default_sentinel); + CHECK(qit->has_value()); + } + + SUBCASE("explicit NTriples") { + std::istringstream iss{R"( "string" .)"}; + IStreamQuadIterator qit{iss, ParsingFlag::NTriples}; + + CHECK(qit.detected_format().syntax == ParsingFlag::NTriples); + CHECK(qit != std::default_sentinel); + CHECK(qit->has_value()); + } + + SUBCASE("explicit NQuads") { + std::stringstream str{" .\n"}; + IStreamQuadIterator qit{str, ParsingFlag::NQuads}; + + CHECK(qit.detected_format().syntax == ParsingFlag::NQuads); + CHECK(qit != std::default_sentinel); + CHECK(qit->has_value()); + } + + SUBCASE("explicit TriG") { + std::stringstream str{" { .}"}; + IStreamQuadIterator qit{str, ParsingFlag::TriG}; + + CHECK(qit.detected_format().syntax == ParsingFlag::TriG); + CHECK(qit != std::default_sentinel); + CHECK(qit->has_value()); + } + + SUBCASE("explicit RdfXml") { + constexpr char const *rdfxml_content = + "\n" + "\n" + " \n" + " hello\n" + " \n" + ""; + std::istringstream iss{rdfxml_content}; + IStreamQuadIterator qit{iss, ParsingFlag::RdfXml}; + + CHECK(qit.detected_format().syntax == ParsingFlag::RdfXml); + CHECK(qit != std::default_sentinel); + CHECK(qit->has_value()); + } + } + + TEST_CASE("RDFFileParser auto mode") { + + SUBCASE("ttl file auto-detected") { + // tests_RDFFileParser_simple.ttl is a pure N-Triples file (no prefixes) + // Auto mode should detect it and parse correctly + size_t count = 0; + for (auto const &v : RDFFileParser{"./tests_RDFFileParser_simple.ttl"}) { + if (v.has_value()) { + ++count; + } else if (count == 3) { + // expected error on the invalid date + ++count; + } + } + CHECK(count == 4); + } + } + + TEST_CASE("Auto mode with fopen C-like API") { + SUBCASE("N-Triples via fopen") { + static constexpr char const *path = "/tmp/rdf4cpp-format-guess-test.nt"; + { + auto *f = fopen(path, "w"); + fprintf(f, " \"hello\" .\n"); + fclose(f); + } + + auto *f = fopen(path, "r"); + IStreamQuadIterator qit{f, + reinterpret_cast(fread), + reinterpret_cast(ferror), + reinterpret_cast(feof)}; + CHECK(qit.detected_format().syntax == ParsingFlag::NTriples); + CHECK(qit != std::default_sentinel); + CHECK(qit->has_value()); + + ++qit; + CHECK(qit == std::default_sentinel); + + fclose(f); + remove(path); + } + } +} diff --git a/tests/parser/tests_FormatGuess_realworld.cpp b/tests/parser/tests_FormatGuess_realworld.cpp new file mode 100644 index 000000000..55c5dd34f --- /dev/null +++ b/tests/parser/tests_FormatGuess_realworld.cpp @@ -0,0 +1,322 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN + +#include + +#include +#include + +#include +#include +#include + +using namespace rdf4cpp; +using namespace rdf4cpp::parser; + +// --- CURL helper (adopted from tests_XMLParser.cpp) --- + +static size_t write_callback(void const *contents, size_t size, size_t nmemb, void *userp) { + static_cast(userp)->append(static_cast(contents), size * nmemb); + return size * nmemb; +} + +static std::string fetch_url(std::string const &url) { + std::string result; + CURL *curl = curl_easy_init(); + REQUIRE(curl != nullptr); + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &result); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, 30L); + CURLcode res = curl_easy_perform(curl); + curl_easy_cleanup(curl); + REQUIRE_EQ(res, CURLE_OK); + return result; +} + +/** + * Check content sniffing detects expected format, check combined guess with filename, + * and for supported formats verify auto-mode parsing produces quads. + * For unsupported formats, verify an exception is thrown. + */ +static void check_detection_and_parse(std::string const &content, + std::string const &filename, + ParsingFlag expected_syntax, + bool expect_parseable) { + INFO("file: ", filename); + + auto prefix = std::string_view{content}.substr(0, 4096); + auto content_guess = guess_format_from_content(prefix); + CHECK_MESSAGE(content_guess.is_known(), "Content sniffing should produce a known format for ", filename); + CHECK_MESSAGE(content_guess.syntax == expected_syntax, + "Expected syntax ", static_cast(expected_syntax), + " but got ", static_cast(content_guess.syntax), " for ", filename); + + auto combined = guess_format(filename, prefix); + CHECK_MESSAGE(combined.is_known(), "Combined guess should be known for ", filename); + CHECK_MESSAGE(combined.syntax == expected_syntax, + "Combined guess expected ", static_cast(expected_syntax), + " but got ", static_cast(combined.syntax), " for ", filename); + + if (expect_parseable) { + std::istringstream iss{content}; + IStreamQuadIterator qit{iss}; + CHECK_MESSAGE(qit.detected_format().syntax == expected_syntax, + "Parser detected wrong format for ", filename); + size_t quad_count = 0; + for (; qit != std::default_sentinel; ++qit) { + if (qit->has_value()) ++quad_count; + } + CHECK_MESSAGE(quad_count > 0, "Expected at least one quad from ", filename); + } else { + std::istringstream iss{content}; + CHECK_THROWS_AS(IStreamQuadIterator{iss}, std::runtime_error); + } +} + +// ============================================================================ +// Real-world file tests — download and verify detection + parsing +// ============================================================================ + +TEST_SUITE("FormatGuess real-world files") { + + // --- N-Triples (.nt) --- + + TEST_CASE("N-Triples: W3C rdf-n-triples tests") { + auto const base = std::string("https://raw.githubusercontent.com/w3c/rdf-tests/main/rdf/rdf11/rdf-n-triples/"); + + SUBCASE("literal.nt") { + check_detection_and_parse(fetch_url(base + "literal.nt"), "literal.nt", ParsingFlag::NTriples, true); + } + SUBCASE("literal_all_controls.nt") { + check_detection_and_parse(fetch_url(base + "literal_all_controls.nt"), "literal_all_controls.nt", ParsingFlag::NTriples, true); + } + SUBCASE("langtagged_string.nt") { + check_detection_and_parse(fetch_url(base + "langtagged_string.nt"), "langtagged_string.nt", ParsingFlag::NTriples, true); + } + SUBCASE("comment_following_triple.nt") { + check_detection_and_parse(fetch_url(base + "comment_following_triple.nt"), "comment_following_triple.nt", ParsingFlag::NTriples, true); + } + SUBCASE("literal_true.nt") { + check_detection_and_parse(fetch_url(base + "literal_true.nt"), "literal_true.nt", ParsingFlag::NTriples, true); + } + } + + TEST_CASE("N-Triples: W3C Turtle test outputs (.nt)") { + auto const base = std::string("https://raw.githubusercontent.com/w3c/rdf-tests/main/rdf/rdf11/rdf-turtle/"); + + SUBCASE("first.nt") { + check_detection_and_parse(fetch_url(base + "first.nt"), "first.nt", ParsingFlag::NTriples, true); + } + SUBCASE("IRIREF_datatype.nt") { + check_detection_and_parse(fetch_url(base + "IRIREF_datatype.nt"), "IRIREF_datatype.nt", ParsingFlag::NTriples, true); + } + SUBCASE("bareword_a_predicate.nt") { + check_detection_and_parse(fetch_url(base + "bareword_a_predicate.nt"), "bareword_a_predicate.nt", ParsingFlag::NTriples, true); + } + } + + TEST_CASE("N-Triples: Serd test outputs") { + auto const base = std::string("https://raw.githubusercontent.com/drobilla/serd/main/test/extra/abbreviate/"); + + SUBCASE("collapse-predicates.nt") { + check_detection_and_parse(fetch_url(base + "collapse-predicates.nt"), "collapse-predicates.nt", ParsingFlag::NTriples, true); + } + SUBCASE("collapse-subjects.nt") { + check_detection_and_parse(fetch_url(base + "collapse-subjects.nt"), "collapse-subjects.nt", ParsingFlag::NTriples, true); + } + } + + // --- N-Quads (.nq) --- + + TEST_CASE("N-Quads: W3C rdf-trig test outputs (.nq)") { + auto const base = std::string("https://raw.githubusercontent.com/w3c/rdf-tests/main/rdf/rdf11/rdf-trig/"); + + // NQ files with 4 terms should be detected as NQuads + SUBCASE("alternating_iri_graphs.nq") { + check_detection_and_parse(fetch_url(base + "alternating_iri_graphs.nq"), "alternating_iri_graphs.nq", ParsingFlag::NQuads, true); + } + } + + // --- Turtle (.ttl) --- + + TEST_CASE("Turtle: W3C rdf-turtle tests") { + auto const base = std::string("https://raw.githubusercontent.com/w3c/rdf-tests/main/rdf/rdf11/rdf-turtle/"); + + SUBCASE("first.ttl") { + check_detection_and_parse(fetch_url(base + "first.ttl"), "first.ttl", ParsingFlag::Turtle, true); + } + SUBCASE("SPARQL_style_prefix.ttl") { + check_detection_and_parse(fetch_url(base + "SPARQL_style_prefix.ttl"), "SPARQL_style_prefix.ttl", ParsingFlag::Turtle, true); + } + SUBCASE("SPARQL_style_base.ttl") { + check_detection_and_parse(fetch_url(base + "SPARQL_style_base.ttl"), "SPARQL_style_base.ttl", ParsingFlag::Turtle, true); + } + SUBCASE("bareword_a_predicate.ttl") { + check_detection_and_parse(fetch_url(base + "bareword_a_predicate.ttl"), "bareword_a_predicate.ttl", ParsingFlag::Turtle, true); + } + SUBCASE("collection_object.ttl") { + check_detection_and_parse(fetch_url(base + "collection_object.ttl"), "collection_object.ttl", ParsingFlag::Turtle, true); + } + SUBCASE("labeled_blank_node_subject.ttl") { + // Content is `_:s

.` which is valid N-Triples (subset of Turtle). + // Content sniffing correctly detects NTriples; combined guess with .ttl + // extension overrides to Turtle. + auto content = fetch_url(base + "labeled_blank_node_subject.ttl"); + auto prefix = std::string_view{content}.substr(0, 4096); + auto content_guess = guess_format_from_content(prefix); + CHECK(content_guess.syntax == ParsingFlag::NTriples); + + auto combined = guess_format("labeled_blank_node_subject.ttl", prefix); + CHECK(combined.syntax == ParsingFlag::Turtle); + + std::istringstream iss{content}; + IStreamQuadIterator qit{iss}; + size_t count = 0; + for (; qit != std::default_sentinel; ++qit) { + if (qit->has_value()) ++count; + } + CHECK(count > 0); + } + } + + TEST_CASE("Turtle: Serd project file") { + SUBCASE("serd.ttl") { + check_detection_and_parse(fetch_url("https://raw.githubusercontent.com/drobilla/serd/main/serd.ttl"), "serd.ttl", ParsingFlag::Turtle, true); + } + } + + TEST_CASE("Turtle: Serd abbreviation tests") { + auto const base = std::string("https://raw.githubusercontent.com/drobilla/serd/main/test/extra/abbreviate/"); + + SUBCASE("collapse-predicates.ttl") { + check_detection_and_parse(fetch_url(base + "collapse-predicates.ttl"), "collapse-predicates.ttl", ParsingFlag::Turtle, true); + } + SUBCASE("collapse-subjects.ttl") { + check_detection_and_parse(fetch_url(base + "collapse-subjects.ttl"), "collapse-subjects.ttl", ParsingFlag::Turtle, true); + } + } + + // --- TriG (.trig) --- + + TEST_CASE("TriG: W3C rdf-trig tests") { + auto const base = std::string("https://raw.githubusercontent.com/w3c/rdf-tests/main/rdf/rdf11/rdf-trig/"); + + SUBCASE("LITERAL1.trig") { + check_detection_and_parse(fetch_url(base + "LITERAL1.trig"), "LITERAL1.trig", ParsingFlag::TriG, true); + } + SUBCASE("trig-kw-graph-01.trig") { + check_detection_and_parse(fetch_url(base + "trig-kw-graph-01.trig"), "trig-kw-graph-01.trig", ParsingFlag::TriG, true); + } + SUBCASE("alternating_iri_graphs.trig") { + check_detection_and_parse(fetch_url(base + "alternating_iri_graphs.trig"), "alternating_iri_graphs.trig", ParsingFlag::TriG, true); + } + SUBCASE("anonymous_blank_node_graph.trig") { + check_detection_and_parse(fetch_url(base + "anonymous_blank_node_graph.trig"), "anonymous_blank_node_graph.trig", ParsingFlag::TriG, true); + } + SUBCASE("labeled_blank_node_graph.trig") { + check_detection_and_parse(fetch_url(base + "labeled_blank_node_graph.trig"), "labeled_blank_node_graph.trig", ParsingFlag::TriG, true); + } + } + + // --- RDF/XML (.rdf) --- + + TEST_CASE("RDF/XML: W3C rdf-xml tests") { + auto const base = std::string("https://raw.githubusercontent.com/w3c/rdf-tests/main/rdf/rdf11/rdf-xml/"); + + SUBCASE("amp-in-url/test001.rdf") { + check_detection_and_parse(fetch_url(base + "amp-in-url/test001.rdf"), "test001.rdf", ParsingFlag::RdfXml, true); + } + SUBCASE("datatypes/test001.rdf") { + check_detection_and_parse(fetch_url(base + "datatypes/test001.rdf"), "datatypes_test001.rdf", ParsingFlag::RdfXml, true); + } + SUBCASE("rdf-charmod-literals/test001.rdf") { + check_detection_and_parse(fetch_url(base + "rdf-charmod-literals/test001.rdf"), "rdf-charmod-literals_test001.rdf", ParsingFlag::RdfXml, true); + } + SUBCASE("rdf-charmod-uris/test001.rdf") { + check_detection_and_parse(fetch_url(base + "rdf-charmod-uris/test001.rdf"), "rdf-charmod-uris_test001.rdf", ParsingFlag::RdfXml, true); + } + SUBCASE("rdf-containers-syntax-vs-schema/test001.rdf") { + check_detection_and_parse(fetch_url(base + "rdf-containers-syntax-vs-schema/test001.rdf"), "rdf-containers_test001.rdf", ParsingFlag::RdfXml, true); + } + } + + TEST_CASE("RDF/XML: .owl files with rdf:RDF root (ambiguous extension)") { + SUBCASE("pizza.owl — RDF/XML with .owl extension") { + auto content = fetch_url("https://raw.githubusercontent.com/owlcs/pizza-ontology/master/pizza.owl"); + auto prefix = std::string_view{content}.substr(0, 4096); + auto guess = guess_format("pizza.owl", prefix); + CHECK(guess.syntax == ParsingFlag::RdfXml); + CHECK(guess.is_known()); + + std::istringstream iss{content}; + IStreamQuadIterator qit{iss}; + CHECK(qit.detected_format().syntax == ParsingFlag::RdfXml); + size_t count = 0; + for (; qit != std::default_sentinel; ++qit) { + if (qit->has_value()) ++count; + } + CHECK(count > 0); + } + } + + // --- OWL/XML (.owx) — detected but unsupported --- + + TEST_CASE("OWL/XML: horned-owl test files (.owx)") { + auto const base = std::string("https://raw.githubusercontent.com/phillord/horned-owl/main/src/ont/owl-xml/"); + + SUBCASE("class.owx") { + check_detection_and_parse(fetch_url(base + "class.owx"), "class.owx", ParsingFlag::OwlXml, false); + } + SUBCASE("annotation.owx") { + check_detection_and_parse(fetch_url(base + "annotation.owx"), "annotation.owx", ParsingFlag::OwlXml, false); + } + SUBCASE("and.owx") { + check_detection_and_parse(fetch_url(base + "and.owx"), "and.owx", ParsingFlag::OwlXml, false); + } + SUBCASE("class-assertion.owx") { + check_detection_and_parse(fetch_url(base + "class-assertion.owx"), "class-assertion.owx", ParsingFlag::OwlXml, false); + } + SUBCASE("ontology-annotation.owx") { + check_detection_and_parse(fetch_url(base + "ontology-annotation.owx"), "ontology-annotation.owx", ParsingFlag::OwlXml, false); + } + } + + TEST_CASE("OWL/XML: .owl files with Ontology root (ambiguous extension)") { + SUBCASE("Time.owl — OWL/XML with .owl extension") { + auto content = fetch_url("https://raw.githubusercontent.com/usnistgov/pdso/master/OWL/Time.owl"); + auto prefix = std::string_view{content}.substr(0, 4096); + auto guess = guess_format("Time.owl", prefix); + CHECK(guess.syntax == ParsingFlag::OwlXml); + + std::istringstream iss{content}; + CHECK_THROWS_AS(IStreamQuadIterator{iss}, std::runtime_error); + } + } + + // --- JSON-LD (.jsonld) — detected but unsupported --- + + TEST_CASE("JSON-LD: W3C json-ld-api examples") { + auto const base = std::string("https://raw.githubusercontent.com/w3c/json-ld-api/main/examples/"); + + SUBCASE("Sample-JSON-LD-document.jsonld") { + check_detection_and_parse(fetch_url(base + "Sample-JSON-LD-document.jsonld"), "Sample-JSON-LD-document.jsonld", ParsingFlag::JsonLd, false); + } + SUBCASE("Compacted-sample-document-compacted.jsonld") { + check_detection_and_parse(fetch_url(base + "Compacted-sample-document-compacted.jsonld"), "Compacted-sample-document-compacted.jsonld", ParsingFlag::JsonLd, false); + } + SUBCASE("Expanded-sample-document.jsonld") { + check_detection_and_parse(fetch_url(base + "Expanded-sample-document.jsonld"), "Expanded-sample-document.jsonld", ParsingFlag::JsonLd, false); + } + SUBCASE("JSON-LD-document-in-compact-form.jsonld") { + check_detection_and_parse(fetch_url(base + "JSON-LD-document-in-compact-form.jsonld"), "JSON-LD-document-in-compact-form.jsonld", ParsingFlag::JsonLd, false); + } + } + + TEST_CASE("JSON-LD: json-ld.org context files") { + SUBCASE("person.jsonld") { + check_detection_and_parse(fetch_url("https://raw.githubusercontent.com/json-ld/json-ld.org/main/contexts/person.jsonld"), "person.jsonld", ParsingFlag::JsonLd, false); + } + } +} diff --git a/tests/parser/tests_IStreamQuadIterator.cpp b/tests/parser/tests_IStreamQuadIterator.cpp index 8df3754f8..f06b5f152 100644 --- a/tests/parser/tests_IStreamQuadIterator.cpp +++ b/tests/parser/tests_IStreamQuadIterator.cpp @@ -328,7 +328,7 @@ TEST_SUITE("IStreamQuadIterator") { constexpr char const *triples = " _:bnode .\n"; std::istringstream iss{triples}; - IStreamQuadIterator qit{iss, ParsingFlag::NoParseBlankNode}; + IStreamQuadIterator qit{iss, ParsingFlag::NoParseBlankNode | ParsingFlag::Turtle}; CHECK_NE(qit, std::default_sentinel); CHECK(!qit->has_value()); From 544c767553a154bfd761dafdd646c344d02cb827 Mon Sep 17 00:00:00 2001 From: bigerl Date: Sun, 1 Mar 2026 20:08:14 +0100 Subject: [PATCH 2/7] improved FormatGuess + docs --- src/rdf4cpp/parser/FormatGuess.cpp | 273 +++++++++++++++-------------- src/rdf4cpp/parser/FormatGuess.md | 80 +++++++++ 2 files changed, 217 insertions(+), 136 deletions(-) create mode 100644 src/rdf4cpp/parser/FormatGuess.md diff --git a/src/rdf4cpp/parser/FormatGuess.cpp b/src/rdf4cpp/parser/FormatGuess.cpp index c907d69e8..e3422dda4 100644 --- a/src/rdf4cpp/parser/FormatGuess.cpp +++ b/src/rdf4cpp/parser/FormatGuess.cpp @@ -1,62 +1,85 @@ #include "FormatGuess.hpp" #include -#include #include +#include + namespace rdf4cpp::parser { // --- helpers --- - static std::string to_lower(std::string_view sv) { - std::string s{sv}; - std::ranges::transform(s, s.begin(), [](unsigned char c) { - return std::tolower(c); - }); - return s; + struct SplitLine { + std::string_view line; + std::string_view rest; + }; + + // RDF syntax delimiters are ASCII bytes (0x00-0x7F) which never appear as + // UTF-8 continuation bytes. Byte-level scanning for these markers is safe + // in valid UTF-8. + + static std::string to_lower(std::string_view const sv) { + return una::cases::to_lowercase_utf8(sv); } - static std::string_view skip_whitespace_and_bom(std::string_view sv) { - // skip UTF-8 BOM - if (sv.size() >= 3 && sv[0] == '\xEF' && sv[1] == '\xBB' && sv[2] == '\xBF') { - sv.remove_prefix(3); + static bool is_ascii_ws(char const c) noexcept { + return c == ' ' || c == '\t' || c == '\n' || c == '\r'; + } + + static std::string_view ltrim_ascii_whitespace(std::string_view const sv) noexcept { + auto const it = std::ranges::find_if_not(sv, is_ascii_ws); + return sv.substr(static_cast(it - sv.begin())); + } + + static std::string_view trim_ascii_whitespace(std::string_view const sv) noexcept { + auto result = ltrim_ascii_whitespace(sv); + while (!result.empty() && is_ascii_ws(result.back())) { + result.remove_suffix(1); + } + return result; + } + + /// Split into first line and everything after the newline. If no newline, rest is empty. + static SplitLine split_next_line(std::string_view const sv) noexcept { + auto const eol = sv.find('\n'); + if (eol == std::string_view::npos) { + return {.line = sv, .rest = {}}; } - // skip leading whitespace - while (!sv.empty() && (sv.front() == ' ' || sv.front() == '\t' || sv.front() == '\n' || sv.front() == '\r')) { - sv.remove_prefix(1); + return {.line = sv.substr(0, eol), .rest = sv.substr(eol + 1)}; + } + + static std::string_view skip_whitespace_and_bom(std::string_view const sv) { + // skip UTF-8 BOM + if (sv.starts_with("\xEF\xBB\xBF")) { + return ltrim_ascii_whitespace(sv.substr(3)); } - return sv; + return ltrim_ascii_whitespace(sv); } - static bool starts_with_icase(std::string_view haystack, std::string_view needle) { + static bool starts_with_icase(std::string_view const haystack, std::string_view const needle) { if (haystack.size() < needle.size()) { return false; } - for (size_t i = 0; i < needle.size(); ++i) { - if (std::tolower(static_cast(haystack[i])) != std::tolower(static_cast(needle[i]))) { - return false; - } - } - return true; + return una::cases::to_lowercase_utf8(haystack.substr(0, needle.size())) == una::cases::to_lowercase_utf8(needle); } - static bool contains(std::string_view haystack, std::string_view needle) { + static bool contains(std::string_view const haystack, std::string_view const needle) { return haystack.find(needle) != std::string_view::npos; } - static bool contains_icase(std::string_view haystack, std::string_view needle) { + static bool contains_icase(std::string_view const haystack, std::string_view const needle) { if (needle.size() > haystack.size()) { return false; } - auto lower_hay = to_lower(haystack); - auto lower_needle = to_lower(needle); + auto const lower_hay = to_lower(haystack); + auto const lower_needle = to_lower(needle); return lower_hay.find(lower_needle) != std::string::npos; } // --- extension mapping --- - FormatGuess guess_format_from_extension(std::string_view extension) noexcept { - auto ext = to_lower(extension); + FormatGuess guess_format_from_extension(std::string_view const extension) noexcept { + auto const ext = to_lower(extension); if (ext == ".ttl" || ext == ".turtle") { return {ParsingFlag::Turtle, GuessConfidence::High}; @@ -88,7 +111,7 @@ namespace rdf4cpp::parser { return {ParsingFlag::Auto, GuessConfidence::None}; } - FormatGuess guess_format_from_path(std::string_view file_path) noexcept { + FormatGuess guess_format_from_path(std::string_view const file_path) noexcept { // find last path separator auto const last_sep = file_path.find_last_of("/\\"); auto const filename = (last_sep != std::string_view::npos) ? file_path.substr(last_sep + 1) : file_path; @@ -104,9 +127,8 @@ namespace rdf4cpp::parser { // --- content sniffing --- - static bool has_trig_markers(std::string_view content) { + static bool has_trig_markers(std::string_view const content) { // Look for GRAPH keyword or { } blocks outside of string literals - // Simple heuristic: look for GRAPH keyword or standalone { not inside quotes if (contains_icase(content, "GRAPH")) { return true; } @@ -115,11 +137,13 @@ namespace rdf4cpp::parser { // or just { at start of a line (default graph block) bool in_string = false; char string_delim = 0; - for (size_t i = 0; i < content.size(); ++i) { - char c = content[i]; + auto cursor = content; + while (!cursor.empty()) { + char const c = cursor.front(); + cursor.remove_prefix(1); if (in_string) { - if (c == '\\' && i + 1 < content.size()) { - ++i; // skip escaped char + if (c == '\\' && !cursor.empty()) { + cursor.remove_prefix(1); // skip escaped char continue; } if (c == string_delim) { @@ -139,15 +163,15 @@ namespace rdf4cpp::parser { return false; } - static FormatGuess sniff_xml_content(std::string_view content) { + static FormatGuess sniff_xml_content(std::string_view const content) { // Check for OWL/XML first — more specific markers. // OWL/XML uses root element (not ) and may still // declare xmlns:rdf as a namespace prefix, so checking OWL/XML before // RDF/XML avoids false positives. - bool has_ontology_root = contains(content, " . (3 terms = NT, 4 terms = NQ) bool found_4_terms = false; bool found_any_triple = false; - size_t pos = 0; - while (pos < content.size()) { - // find end of line - auto eol = content.find('\n', pos); - auto line = content.substr(pos, eol == std::string_view::npos ? std::string_view::npos : eol - pos); - pos = (eol == std::string_view::npos) ? content.size() : eol + 1; - - // trim - while (!line.empty() && (line.front() == ' ' || line.front() == '\t')) { - line.remove_prefix(1); - } - while (!line.empty() && (line.back() == ' ' || line.back() == '\t' || line.back() == '\r')) { - line.remove_suffix(1); - } + auto remaining = content; + while (!remaining.empty()) { + auto const [line_raw, rest] = split_next_line(remaining); + remaining = rest; + auto const line = trim_ascii_whitespace(line_raw); // skip empty lines and comments if (line.empty() || line.front() == '#') { @@ -202,79 +217,70 @@ namespace rdf4cpp::parser { // that appear outside of IRIs and literals correctly. int term_count = 0; bool found_dot = false; - size_t i = 0; - while (i < line.size()) { + auto cursor = line; + while (!cursor.empty()) { // skip whitespace - while (i < line.size() && (line[i] == ' ' || line[i] == '\t')) { - ++i; - } - if (i >= line.size()) { + cursor = ltrim_ascii_whitespace(cursor); + if (cursor.empty()) { break; } - char c = line[i]; + char const c = cursor.front(); if (c == '.') { found_dot = true; - ++i; - // skip trailing whitespace and optional comment after . - while (i < line.size() && (line[i] == ' ' || line[i] == '\t')) { - ++i; - } - if (i < line.size() && line[i] == '#') { - // inline comment after dot — valid - } + cursor.remove_prefix(1); break; } else if (c == '#') { // comment at top level (outside IRI/literal) — not valid N-Triples - // unless we already found the dot return {ParsingFlag::Auto, GuessConfidence::None}; } else if (c == '<') { // IRI — find closing > - auto close = line.find('>', i); + auto const close = cursor.find('>'); if (close == std::string_view::npos) { break; } - i = close + 1; + cursor.remove_prefix(close + 1); ++term_count; - } else if (c == '_' && i + 1 < line.size() && line[i + 1] == ':') { + } else if (cursor.starts_with("_:")) { // blank node — skip to next whitespace - while (i < line.size() && line[i] != ' ' && line[i] != '\t') { - ++i; - } + auto const ws = std::ranges::find_if(cursor, is_ascii_ws); + cursor.remove_prefix(static_cast(ws - cursor.begin())); ++term_count; } else if (c == '"') { // literal — find unescaped closing quote, then skip datatype/lang - ++i; - while (i < line.size()) { - if (line[i] == '\\') { - i += 2; + cursor.remove_prefix(1); + while (!cursor.empty()) { + if (cursor.front() == '\\') { + cursor.remove_prefix(std::min(2, cursor.size())); continue; } - if (line[i] == '"') { + if (cursor.front() == '"') { break; } - ++i; + cursor.remove_prefix(1); } - if (i < line.size()) { - ++i; // skip closing quote + if (!cursor.empty()) { + cursor.remove_prefix(1); // skip closing quote } // skip ^^ or @lang (which may contain # inside <...>) - if (i + 1 < line.size() && line[i] == '^' && line[i + 1] == '^') { - i += 2; - if (i < line.size() && line[i] == '<') { - auto close = line.find('>', i); + if (cursor.starts_with("^^")) { + cursor.remove_prefix(2); + if (!cursor.empty() && cursor.front() == '<') { + auto const close = cursor.find('>'); if (close != std::string_view::npos) { - i = close + 1; + cursor.remove_prefix(close + 1); } } else { - while (i < line.size() && line[i] != ' ' && line[i] != '\t' && line[i] != '.') { - ++i; - } - } - } else if (i < line.size() && line[i] == '@') { - while (i < line.size() && line[i] != ' ' && line[i] != '\t' && line[i] != '.') { - ++i; + auto const end = std::ranges::find_if(cursor, [](char ch) { + return is_ascii_ws(ch) || ch == '.'; + }); + cursor.remove_prefix(static_cast(end - cursor.begin())); } + } else if (!cursor.empty() && cursor.front() == '@') { + auto const end = std::ranges::find_if(cursor, [](char ch) { + return is_ascii_ws(ch) || ch == '.'; + }); + cursor.remove_prefix(static_cast(end - cursor.begin())); } ++term_count; } else { @@ -306,30 +312,27 @@ namespace rdf4cpp::parser { return {ParsingFlag::NTriples, GuessConfidence::Medium}; } - static std::string_view skip_comments(std::string_view sv) { + static std::string_view skip_comments(std::string_view const sv) { // skip lines starting with # (comments in N-Triples/Turtle/TriG) - while (!sv.empty() && sv.front() == '#') { - auto eol = sv.find('\n'); + auto cursor = sv; + while (!cursor.empty() && cursor.front() == '#') { + auto const eol = cursor.find('\n'); if (eol == std::string_view::npos) { return {}; } - sv.remove_prefix(eol + 1); - // skip whitespace after comment line - while (!sv.empty() && (sv.front() == ' ' || sv.front() == '\t' || sv.front() == '\n' || sv.front() == '\r')) { - sv.remove_prefix(1); - } + cursor = ltrim_ascii_whitespace(cursor.substr(eol + 1)); } - return sv; + return cursor; } - FormatGuess guess_format_from_content(std::string_view prefix) noexcept { - auto full_content = skip_whitespace_and_bom(prefix); + FormatGuess guess_format_from_content(std::string_view const prefix) noexcept { + auto const full_content = skip_whitespace_and_bom(prefix); if (full_content.empty()) { return {ParsingFlag::Auto, GuessConfidence::None}; } // Skip leading comment lines for the first-byte checks - auto content = skip_comments(full_content); + auto const content = skip_comments(full_content); if (content.empty()) { return {ParsingFlag::Auto, GuessConfidence::None}; } @@ -344,13 +347,7 @@ namespace rdf4cpp::parser { // JSON-based formats — but `{` can also be a TriG default graph block, // and `[` can be a TriG blank node graph name or a Turtle blank node property list. if (content.front() == '[') { - auto after_bracket = content.substr(1); - while (!after_bracket.empty() - && (after_bracket.front() == ' ' || after_bracket.front() == '\t' || after_bracket.front() == '\n' - || after_bracket.front() == '\r')) - { - after_bracket.remove_prefix(1); - } + auto const after_bracket = ltrim_ascii_whitespace(content.substr(1)); // JSON arrays start with `[` followed by `{`, `"`, `[`, number, true, false, null // Turtle/TriG blank nodes: `[]` or `[ predicate object ]` if (!after_bracket.empty() && (after_bracket.front() == ']' || after_bracket.front() == '<' || after_bracket.front() == '_')) { @@ -363,13 +360,7 @@ namespace rdf4cpp::parser { return sniff_json_content(content); } if (content.front() == '{') { - auto after_brace = content.substr(1); - while (!after_brace.empty() - && (after_brace.front() == ' ' || after_brace.front() == '\t' || after_brace.front() == '\n' - || after_brace.front() == '\r')) - { - after_brace.remove_prefix(1); - } + auto const after_brace = ltrim_ascii_whitespace(content.substr(1)); if (after_brace.empty() || after_brace.front() == '"') { return sniff_json_content(content); } @@ -394,8 +385,8 @@ namespace rdf4cpp::parser { } // Phase 2: try N-Triples / N-Quads line-based detection - if (content.front() == '<' || (content.front() == '_' && content.size() > 1 && content[1] == ':')) { - auto result = sniff_ntriples_or_nquads(full_content); + if (content.front() == '<' || content.starts_with("_:")) { + auto const result = sniff_ntriples_or_nquads(full_content); if (result.is_known()) { return result; } @@ -406,50 +397,60 @@ namespace rdf4cpp::parser { // Phase 3: check for Turtle/TriG syntax markers in content that didn't // match any earlier patterns (e.g. Turtle without @prefix directives) { + static constexpr std::string_view turtle_markers = ";,()[]{}"; bool has_turtle_marker = false; bool in_iri = false; bool in_string = false; char string_delim = 0; + char prev_char = 0; - for (size_t i = 0; i < content.size(); ++i) { - char c = content[i]; + auto cursor = content; + while (!cursor.empty()) { + char const c = cursor.front(); + cursor.remove_prefix(1); if (in_string) { - if (c == '\\' && i + 1 < content.size()) { - ++i; + if (c == '\\' && !cursor.empty()) { + prev_char = cursor.front(); + cursor.remove_prefix(1); continue; } if (c == string_delim) { in_string = false; } + prev_char = c; continue; } if (in_iri) { if (c == '>') { in_iri = false; } + prev_char = c; continue; } if (c == '<') { in_iri = true; + prev_char = c; continue; } if (c == '"' || c == '\'') { in_string = true; string_delim = c; + prev_char = c; continue; } // Turtle/TriG syntax markers not valid in N-Triples - if (c == ';' || c == ',' || c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}') { + if (turtle_markers.find(c) != std::string_view::npos) { has_turtle_marker = true; break; } // bare `a` surrounded by whitespace = Turtle shorthand for rdf:type - if (c == 'a' && i > 0 && (content[i - 1] == ' ' || content[i - 1] == '\t') && i + 1 < content.size() - && (content[i + 1] == ' ' || content[i + 1] == '\t')) + if (c == 'a' && (prev_char == ' ' || prev_char == '\t') && !cursor.empty() + && (cursor.front() == ' ' || cursor.front() == '\t')) { has_turtle_marker = true; break; } + prev_char = c; } if (has_turtle_marker) { @@ -463,9 +464,9 @@ namespace rdf4cpp::parser { return {ParsingFlag::Auto, GuessConfidence::None}; } - FormatGuess guess_format(std::string_view file_path, std::string_view prefix) noexcept { - auto ext_guess = guess_format_from_path(file_path); - auto content_guess = guess_format_from_content(prefix); + FormatGuess guess_format(std::string_view const file_path, std::string_view const prefix) noexcept { + auto const ext_guess = guess_format_from_path(file_path); + auto const content_guess = guess_format_from_content(prefix); // If extension gives a strong match and no content sniffing needed if (ext_guess.confidence == GuessConfidence::High) { diff --git a/src/rdf4cpp/parser/FormatGuess.md b/src/rdf4cpp/parser/FormatGuess.md new file mode 100644 index 000000000..9477eddba --- /dev/null +++ b/src/rdf4cpp/parser/FormatGuess.md @@ -0,0 +1,80 @@ +# FormatGuess — RDF Serialization Format Detection + +## Purpose + +`FormatGuess` provides automatic detection of RDF serialization formats from +file extensions and/or a content prefix (the first few hundred to few thousand +bytes). It returns a `FormatGuess` consisting of a `ParsingFlag` (the detected +syntax) and a `GuessConfidence` level. + +## Detection Strategy + +Three entry points, in order of specificity: + +| Function | Input | Returned confidence levels | +|---------------------------------|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `guess_format_from_extension()` | file extension | High (`.ttl`, `.nt`, …), Low (`.owl`, `.xml`), or None | +| `guess_format_from_content()` | byte prefix | High (XML root, `@prefix`, JSON-LD keywords), Medium (`PREFIX`/`BASE`, N-Triples/N-Quads grammar, TriG markers), Low (Turtle syntax markers, generic XML), or None | +| `guess_format()` | path + prefix | Certain (extension + content agree), otherwise delegates to the above | + +`guess_format()` combines extension and content results: when both agree the +confidence is boosted to **Certain**. + +## Content Sniffing Phases + +`guess_format_from_content()` inspects the prefix in three ordered phases. +Processing stops at the first match. + +### Phase 1 — Deterministic Checks + +Fast tests on the first non-whitespace, non-comment bytes: + +* ` Date: Sun, 1 Mar 2026 20:16:13 +0100 Subject: [PATCH 3/7] fix docs --- src/rdf4cpp/parser/FormatGuess.hpp | 1 + src/rdf4cpp/parser/FormatGuess.md | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/rdf4cpp/parser/FormatGuess.hpp b/src/rdf4cpp/parser/FormatGuess.hpp index 268734e7d..e56b18480 100644 --- a/src/rdf4cpp/parser/FormatGuess.hpp +++ b/src/rdf4cpp/parser/FormatGuess.hpp @@ -50,6 +50,7 @@ namespace rdf4cpp::parser { /** * Guess the RDF serialization format by inspecting a prefix of the file content. * At least 512 bytes recommended, 4096 bytes ideal. + * Strips a leading UTF-8 BOM and skips whitespace and #-comment lines before sniffing. */ [[nodiscard]] FormatGuess guess_format_from_content(std::string_view prefix) noexcept; diff --git a/src/rdf4cpp/parser/FormatGuess.md b/src/rdf4cpp/parser/FormatGuess.md index 9477eddba..4a34e723c 100644 --- a/src/rdf4cpp/parser/FormatGuess.md +++ b/src/rdf4cpp/parser/FormatGuess.md @@ -11,19 +11,20 @@ syntax) and a `GuessConfidence` level. Three entry points, in order of specificity: -| Function | Input | Returned confidence levels | -|---------------------------------|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `guess_format_from_extension()` | file extension | High (`.ttl`, `.nt`, …), Low (`.owl`, `.xml`), or None | -| `guess_format_from_content()` | byte prefix | High (XML root, `@prefix`, JSON-LD keywords), Medium (`PREFIX`/`BASE`, N-Triples/N-Quads grammar, TriG markers), Low (Turtle syntax markers, generic XML), or None | -| `guess_format()` | path + prefix | Certain (extension + content agree), otherwise delegates to the above | +| Function | Input | Returned confidence levels | +|---------------------------------|----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `guess_format_from_extension()` | file extension | High (`.ttl`, `.nt`, …), Low (`.owl`, `.xml`), or None | +| `guess_format_from_content()` | byte prefix | High (XML root, `@prefix`, JSON-LD keywords), Medium (`PREFIX`/`BASE`, N-Triples/N-Quads grammar, TriG markers), Low (Turtle/TriG syntax markers, generic XML), or None | +| `guess_format()` | path + prefix | Certain (extension + content agree), otherwise delegates to the above | `guess_format()` combines extension and content results: when both agree the confidence is boosted to **Certain**. ## Content Sniffing Phases -`guess_format_from_content()` inspects the prefix in three ordered phases. -Processing stops at the first match. +`guess_format_from_content()` first strips a UTF-8 BOM (if present) and +leading whitespace, then skips leading `#`-comment lines before inspecting +the prefix in three ordered phases. Processing stops at the first match. ### Phase 1 — Deterministic Checks @@ -49,8 +50,9 @@ Scan for syntax characters that are valid in Turtle/TriG but not in N-Triples: `;` `,` `(` `)` `[` `]` `{` `}` and the bare keyword `a` (rdf:type shorthand). Strings and IRIs are skipped to avoid false matches. -If markers are found, check for TriG-specific patterns (GRAPH keyword or -`{` outside strings) and return Turtle or TriG at Low confidence. +If markers are found, check for TriG-specific patterns (case-insensitive +GRAPH keyword or `{` outside strings) and return Turtle or TriG at Low +confidence. ## Encoding Assumptions From 745515349de8bff232fde8c50bcae54b2ad957c5 Mon Sep 17 00:00:00 2001 From: bigerl Date: Sun, 1 Mar 2026 20:31:54 +0100 Subject: [PATCH 4/7] refactoring --- src/rdf4cpp/parser/FormatGuess.cpp | 221 ++++++++++++++--------------- src/rdf4cpp/parser/FormatGuess.hpp | 2 +- tests/bench_SerDe.cpp | 2 +- 3 files changed, 111 insertions(+), 114 deletions(-) diff --git a/src/rdf4cpp/parser/FormatGuess.cpp b/src/rdf4cpp/parser/FormatGuess.cpp index e3422dda4..7261297fb 100644 --- a/src/rdf4cpp/parser/FormatGuess.cpp +++ b/src/rdf4cpp/parser/FormatGuess.cpp @@ -18,10 +18,6 @@ namespace rdf4cpp::parser { // UTF-8 continuation bytes. Byte-level scanning for these markers is safe // in valid UTF-8. - static std::string to_lower(std::string_view const sv) { - return una::cases::to_lowercase_utf8(sv); - } - static bool is_ascii_ws(char const c) noexcept { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; } @@ -48,7 +44,7 @@ namespace rdf4cpp::parser { return {.line = sv.substr(0, eol), .rest = sv.substr(eol + 1)}; } - static std::string_view skip_whitespace_and_bom(std::string_view const sv) { + static std::string_view skip_whitespace_and_bom(std::string_view const sv) noexcept { // skip UTF-8 BOM if (sv.starts_with("\xEF\xBB\xBF")) { return ltrim_ascii_whitespace(sv.substr(3)); @@ -68,18 +64,13 @@ namespace rdf4cpp::parser { } static bool contains_icase(std::string_view const haystack, std::string_view const needle) { - if (needle.size() > haystack.size()) { - return false; - } - auto const lower_hay = to_lower(haystack); - auto const lower_needle = to_lower(needle); - return lower_hay.find(lower_needle) != std::string::npos; + return static_cast(una::caseless::find_utf8(haystack, needle)); } // --- extension mapping --- FormatGuess guess_format_from_extension(std::string_view const extension) noexcept { - auto const ext = to_lower(extension); + auto const ext = una::cases::to_lowercase_utf8(extension); if (ext == ".ttl" || ext == ".turtle") { return {ParsingFlag::Turtle, GuessConfidence::High}; @@ -116,7 +107,7 @@ namespace rdf4cpp::parser { auto const last_sep = file_path.find_last_of("/\\"); auto const filename = (last_sep != std::string_view::npos) ? file_path.substr(last_sep + 1) : file_path; - // find last dot in filename + // find the last dot in the filename auto const dot_pos = filename.rfind('.'); if (dot_pos == std::string_view::npos) { return {ParsingFlag::Auto, GuessConfidence::None}; @@ -133,8 +124,8 @@ namespace rdf4cpp::parser { return true; } - // Look for pattern like IRI/prefixed-name followed by { - // or just { at start of a line (default graph block) + // Look for a pattern like IRI/prefixed-name followed by { + // or just { at the start of a line (default graph block) bool in_string = false; char string_delim = 0; auto cursor = content; @@ -214,7 +205,7 @@ namespace rdf4cpp::parser { } // Count terms by walking through the line, handling # comments - // that appear outside of IRIs and literals correctly. + // that appear outside IRIs and literals correctly. int term_count = 0; bool found_dot = false; auto cursor = line; @@ -224,14 +215,13 @@ namespace rdf4cpp::parser { if (cursor.empty()) { break; } - char const c = cursor.front(); - if (c == '.') { + if (char const c = cursor.front(); c == '.') { found_dot = true; cursor.remove_prefix(1); break; } else if (c == '#') { - // comment at top level (outside IRI/literal) — not valid N-Triples + // comment at the top level (outside IRI/literal) — not valid N-Triples return {ParsingFlag::Auto, GuessConfidence::None}; } else if (c == '<') { // IRI — find closing > @@ -242,7 +232,7 @@ namespace rdf4cpp::parser { cursor.remove_prefix(close + 1); ++term_count; } else if (cursor.starts_with("_:")) { - // blank node — skip to next whitespace + // blank node — skip to the next whitespace auto const ws = std::ranges::find_if(cursor, is_ascii_ws); cursor.remove_prefix(static_cast(ws - cursor.begin())); ++term_count; @@ -266,18 +256,17 @@ namespace rdf4cpp::parser { if (cursor.starts_with("^^")) { cursor.remove_prefix(2); if (!cursor.empty() && cursor.front() == '<') { - auto const close = cursor.find('>'); - if (close != std::string_view::npos) { + if (auto const close = cursor.find('>'); close != std::string_view::npos) { cursor.remove_prefix(close + 1); } } else { - auto const end = std::ranges::find_if(cursor, [](char ch) { + auto const end = std::ranges::find_if(cursor, [](char const ch) noexcept { return is_ascii_ws(ch) || ch == '.'; }); cursor.remove_prefix(static_cast(end - cursor.begin())); } } else if (!cursor.empty() && cursor.front() == '@') { - auto const end = std::ranges::find_if(cursor, [](char ch) { + auto const end = std::ranges::find_if(cursor, [](char const ch) noexcept { return is_ascii_ws(ch) || ch == '.'; }); cursor.remove_prefix(static_cast(end - cursor.begin())); @@ -325,6 +314,95 @@ namespace rdf4cpp::parser { return cursor; } + static FormatGuess sniff_bracket_content(std::string_view const content, std::string_view const full_content) { + auto const after_bracket = ltrim_ascii_whitespace(content.substr(1)); + // JSON arrays start with `[` followed by `{`, `"`, `[`, number, true, false, null + // Turtle/TriG blank nodes: `[]` or `[ predicate object ]` + if (!after_bracket.empty() && (after_bracket.front() == ']' || after_bracket.front() == '<' || after_bracket.front() == '_')) { + // Likely Turtle/TriG blank node + if (has_trig_markers(full_content)) { + return {ParsingFlag::TriG, GuessConfidence::Medium}; + } + return {ParsingFlag::Turtle, GuessConfidence::Low}; + } + return sniff_json_content(content); + } + + static FormatGuess sniff_brace_content(std::string_view const content) { + auto const after_brace = ltrim_ascii_whitespace(content.substr(1)); + if (after_brace.empty() || after_brace.front() == '"') { + return sniff_json_content(content); + } + // Likely TriG — `{` followed by non-JSON content + return {ParsingFlag::TriG, GuessConfidence::Medium}; + } + + static FormatGuess sniff_turtle_or_trig_markers(std::string_view const content, std::string_view const full_content) { + static constexpr std::string_view turtle_markers = ";,()[]{}"; + bool has_turtle_marker = false; + bool in_iri = false; + bool in_string = false; + char string_delim = 0; + char prev_char = 0; + + auto cursor = content; + while (!cursor.empty()) { + char const c = cursor.front(); + cursor.remove_prefix(1); + if (in_string) { + if (c == '\\' && !cursor.empty()) { + prev_char = cursor.front(); + cursor.remove_prefix(1); + continue; + } + if (c == string_delim) { + in_string = false; + } + prev_char = c; + continue; + } + if (in_iri) { + if (c == '>') { + in_iri = false; + } + prev_char = c; + continue; + } + if (c == '<') { + in_iri = true; + prev_char = c; + continue; + } + if (c == '"' || c == '\'') { + in_string = true; + string_delim = c; + prev_char = c; + continue; + } + // Turtle/TriG syntax markers not valid in N-Triples + if (turtle_markers.find(c) != std::string_view::npos) { + has_turtle_marker = true; + break; + } + // bare `a` surrounded by whitespace = Turtle shorthand for rdf:type + if (c == 'a' && (prev_char == ' ' || prev_char == '\t') && !cursor.empty() && (cursor.front() == ' ' || cursor.front() == '\t')) + { + has_turtle_marker = true; + break; + } + prev_char = c; + } + + if (has_turtle_marker) { + if (has_trig_markers(full_content)) { + return {ParsingFlag::TriG, GuessConfidence::Low}; + } + return {ParsingFlag::Turtle, GuessConfidence::Low}; + } + + return {ParsingFlag::Auto, GuessConfidence::None}; + } + FormatGuess guess_format_from_content(std::string_view const prefix) noexcept { auto const full_content = skip_whitespace_and_bom(prefix); if (full_content.empty()) { @@ -347,25 +425,10 @@ namespace rdf4cpp::parser { // JSON-based formats — but `{` can also be a TriG default graph block, // and `[` can be a TriG blank node graph name or a Turtle blank node property list. if (content.front() == '[') { - auto const after_bracket = ltrim_ascii_whitespace(content.substr(1)); - // JSON arrays start with `[` followed by `{`, `"`, `[`, number, true, false, null - // Turtle/TriG blank nodes: `[]` or `[ predicate object ]` - if (!after_bracket.empty() && (after_bracket.front() == ']' || after_bracket.front() == '<' || after_bracket.front() == '_')) { - // Likely Turtle/TriG blank node - if (has_trig_markers(full_content)) { - return {ParsingFlag::TriG, GuessConfidence::Medium}; - } - return {ParsingFlag::Turtle, GuessConfidence::Low}; - } - return sniff_json_content(content); + return sniff_bracket_content(content, full_content); } if (content.front() == '{') { - auto const after_brace = ltrim_ascii_whitespace(content.substr(1)); - if (after_brace.empty() || after_brace.front() == '"') { - return sniff_json_content(content); - } - // Likely TriG — `{` followed by non-JSON content - return {ParsingFlag::TriG, GuessConfidence::Medium}; + return sniff_brace_content(content); } // Turtle directives (case-sensitive @prefix/@base) @@ -386,8 +449,7 @@ namespace rdf4cpp::parser { // Phase 2: try N-Triples / N-Quads line-based detection if (content.front() == '<' || content.starts_with("_:")) { - auto const result = sniff_ntriples_or_nquads(full_content); - if (result.is_known()) { + if (auto const result = sniff_ntriples_or_nquads(full_content); result.is_known()) { return result; } // If N-Triples detection failed, the content starts with `<` or `_:` @@ -395,73 +457,8 @@ namespace rdf4cpp::parser { } // Phase 3: check for Turtle/TriG syntax markers in content that didn't - // match any earlier patterns (e.g. Turtle without @prefix directives) - { - static constexpr std::string_view turtle_markers = ";,()[]{}"; - bool has_turtle_marker = false; - bool in_iri = false; - bool in_string = false; - char string_delim = 0; - char prev_char = 0; - - auto cursor = content; - while (!cursor.empty()) { - char const c = cursor.front(); - cursor.remove_prefix(1); - if (in_string) { - if (c == '\\' && !cursor.empty()) { - prev_char = cursor.front(); - cursor.remove_prefix(1); - continue; - } - if (c == string_delim) { - in_string = false; - } - prev_char = c; - continue; - } - if (in_iri) { - if (c == '>') { - in_iri = false; - } - prev_char = c; - continue; - } - if (c == '<') { - in_iri = true; - prev_char = c; - continue; - } - if (c == '"' || c == '\'') { - in_string = true; - string_delim = c; - prev_char = c; - continue; - } - // Turtle/TriG syntax markers not valid in N-Triples - if (turtle_markers.find(c) != std::string_view::npos) { - has_turtle_marker = true; - break; - } - // bare `a` surrounded by whitespace = Turtle shorthand for rdf:type - if (c == 'a' && (prev_char == ' ' || prev_char == '\t') && !cursor.empty() - && (cursor.front() == ' ' || cursor.front() == '\t')) - { - has_turtle_marker = true; - break; - } - prev_char = c; - } - - if (has_turtle_marker) { - if (has_trig_markers(full_content)) { - return {ParsingFlag::TriG, GuessConfidence::Low}; - } - return {ParsingFlag::Turtle, GuessConfidence::Low}; - } - } - - return {ParsingFlag::Auto, GuessConfidence::None}; + // match any earlier patterns (e.g., Turtle without @prefix directives) + return sniff_turtle_or_trig_markers(content, full_content); } FormatGuess guess_format(std::string_view const file_path, std::string_view const prefix) noexcept { @@ -474,11 +471,11 @@ namespace rdf4cpp::parser { if (content_guess.is_known() && content_guess.syntax == ext_guess.syntax) { return {ext_guess.syntax, GuessConfidence::Certain}; } - // Extension is high confidence — trust it even if content is ambiguous + // Extension is high confidence — trust it even if the content is ambiguous return ext_guess; } - // Low confidence extension (e.g. .owl, .xml) — need content disambiguation + // Low confidence extension (e.g., .owl, .xml) — need content disambiguation if (ext_guess.confidence == GuessConfidence::Low) { if (content_guess.is_known()) { // Content overrides ambiguous extension diff --git a/src/rdf4cpp/parser/FormatGuess.hpp b/src/rdf4cpp/parser/FormatGuess.hpp index e56b18480..8a17223d4 100644 --- a/src/rdf4cpp/parser/FormatGuess.hpp +++ b/src/rdf4cpp/parser/FormatGuess.hpp @@ -13,7 +13,7 @@ namespace rdf4cpp::parser { */ enum struct GuessConfidence : uint8_t { None = 0, ///< no guess could be made - Low, ///< weak heuristic match (e.g. ambiguous extension like .owl) + Low, ///< weak heuristic match (e.g., ambiguous extension like .owl) Medium, ///< content sniffing with good signal High, ///< file extension match or strong content match Certain, ///< unambiguous (extension + content agree) diff --git a/tests/bench_SerDe.cpp b/tests/bench_SerDe.cpp index a221b5145..e620a059a 100644 --- a/tests/bench_SerDe.cpp +++ b/tests/bench_SerDe.cpp @@ -6,7 +6,7 @@ #include void download_swdf(std::filesystem::path const &base) { - auto curl_cmd = std::format("wget -P '{}' https://hobbitdata.informatik.uni-leipzig.de/ISWC2020_Tentris/swdf.zip", base.c_str()); + auto curl_cmd = std::format("wget -P '{}' https://files.dice-research.org/datasets/ISWC2020_Tentris/swdf.zip", base.c_str()); std::system(curl_cmd.c_str()); auto const swdf_path = base / "swdf.zip"; From 9cc710d8de7a2db045f524ce27bc3a82a1441cfe Mon Sep 17 00:00:00 2001 From: bigerl Date: Sun, 1 Mar 2026 21:07:49 +0100 Subject: [PATCH 5/7] use filesystem --- src/rdf4cpp/parser/FormatGuess.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/rdf4cpp/parser/FormatGuess.cpp b/src/rdf4cpp/parser/FormatGuess.cpp index 7261297fb..955275bd4 100644 --- a/src/rdf4cpp/parser/FormatGuess.cpp +++ b/src/rdf4cpp/parser/FormatGuess.cpp @@ -1,6 +1,7 @@ #include "FormatGuess.hpp" #include +#include #include #include @@ -103,17 +104,11 @@ namespace rdf4cpp::parser { } FormatGuess guess_format_from_path(std::string_view const file_path) noexcept { - // find last path separator - auto const last_sep = file_path.find_last_of("/\\"); - auto const filename = (last_sep != std::string_view::npos) ? file_path.substr(last_sep + 1) : file_path; - - // find the last dot in the filename - auto const dot_pos = filename.rfind('.'); - if (dot_pos == std::string_view::npos) { + auto const ext = std::filesystem::path{file_path}.extension().string(); + if (ext.empty()) { return {ParsingFlag::Auto, GuessConfidence::None}; } - - return guess_format_from_extension(filename.substr(dot_pos)); + return guess_format_from_extension(ext); } // --- content sniffing --- From b802941ad79ae046b0c48065505030555b538a73 Mon Sep 17 00:00:00 2001 From: bigerl Date: Mon, 2 Mar 2026 08:59:31 +0100 Subject: [PATCH 6/7] add missing file --- .../rdf4cpp/parser/PrefixBufferedReader.hpp | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 private/rdf4cpp/parser/PrefixBufferedReader.hpp diff --git a/private/rdf4cpp/parser/PrefixBufferedReader.hpp b/private/rdf4cpp/parser/PrefixBufferedReader.hpp new file mode 100644 index 000000000..c4f10aa75 --- /dev/null +++ b/private/rdf4cpp/parser/PrefixBufferedReader.hpp @@ -0,0 +1,79 @@ +#ifndef RDF4CPP_PARSER_PRIVATE_PREFIXBUFFEREDREADER_HPP +#define RDF4CPP_PARSER_PRIVATE_PREFIXBUFFEREDREADER_HPP + +#include +#include +#include +#include + +#include + +namespace rdf4cpp::parser { + + /** + * Wraps a C-like IO stream (void*, ReadFunc, ErrorFunc, EOFFunc) and + * serves a buffered prefix first, then delegates to the underlying stream. + * + * This is used when we peek at the start of a stream for content sniffing + * but need to replay those bytes for the actual parser. + */ + struct PrefixBufferedReader { + void *underlying_stream; + ReadFunc underlying_read; + ErrorFunc underlying_error; + EOFFunc underlying_eof; + + std::vector prefix_buf; + size_t prefix_offset = 0; + + PrefixBufferedReader(void *stream, ReadFunc read, ErrorFunc error, EOFFunc eof, std::vector prefix) + : underlying_stream{stream}, + underlying_read{read}, + underlying_error{error}, + underlying_eof{eof}, + prefix_buf{std::move(prefix)} { + } + + static size_t read_func(void *buffer, size_t elem_size, size_t count, void *voided_self) noexcept { + auto *self = static_cast(voided_self); + auto *buf = static_cast(buffer); + size_t total_bytes = elem_size * count; + size_t bytes_read = 0; + + // serve from the prefix buffer first + size_t const prefix_remaining = self->prefix_buf.size() - self->prefix_offset; + if (prefix_remaining > 0) { + size_t const from_prefix = std::min(total_bytes, prefix_remaining); + std::memcpy(buf, self->prefix_buf.data() + self->prefix_offset, from_prefix); + self->prefix_offset += from_prefix; + bytes_read += from_prefix; + total_bytes -= from_prefix; + buf += from_prefix; + } + + // delegate remaining to the underlying stream + if (total_bytes > 0) { + bytes_read += self->underlying_read(buf, 1, total_bytes, self->underlying_stream); + } + + return bytes_read; + } + + static int error_func(void *voided_self) noexcept { + auto const *self = static_cast(voided_self); + return self->underlying_error(self->underlying_stream); + } + + static int eof_func(void *voided_self) noexcept { + auto const *self = static_cast(voided_self); + // not at eof if we still have buffered prefix data + if (self->prefix_offset < self->prefix_buf.size()) { + return 0; + } + return self->underlying_eof(self->underlying_stream); + } + }; + +} // namespace rdf4cpp::parser + +#endif // RDF4CPP_PARSER_PRIVATE_PREFIXBUFFEREDREADER_HPP From b6c3435b4578757bfa9f2d0740a08637c6a68501 Mon Sep 17 00:00:00 2001 From: bigerl Date: Mon, 2 Mar 2026 10:27:14 +0100 Subject: [PATCH 7/7] fix leak --- private/rdf4cpp/parser/XMLParser.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/private/rdf4cpp/parser/XMLParser.hpp b/private/rdf4cpp/parser/XMLParser.hpp index bf1e38d48..93af65643 100644 --- a/private/rdf4cpp/parser/XMLParser.hpp +++ b/private/rdf4cpp/parser/XMLParser.hpp @@ -32,6 +32,10 @@ namespace rdf4cpp::parser { // see https://github.com/NVIDIA/stdexec/issues/1143 struct XmlParserCtxtDtorLambda { void operator()(xmlParserCtxt *c) const { + if (c != nullptr && c->myDoc != nullptr) { + xmlFreeDoc(c->myDoc); + c->myDoc = nullptr; + } xmlFreeParserCtxt(c); } };