From 5143031af3c8def7482b2f516983826f3b4f701a Mon Sep 17 00:00:00 2001
From: bigerl <alexander@bigerl.eu>
Date: Sun, 1 Mar 2026 19:18:34 +0100
Subject: [PATCH 1/7] initial implementation

---
 CMakeLists.txt                                |   3 +-
 .../parser/IStreamQuadIteratorSerdImpl.hpp    |   5 +-
 src/rdf4cpp/parser/FormatGuess.cpp            | 498 ++++++++++++++++++
 src/rdf4cpp/parser/FormatGuess.hpp            |  64 +++
 src/rdf4cpp/parser/IStreamQuadIterator.cpp    |  67 ++-
 src/rdf4cpp/parser/IStreamQuadIterator.hpp    |  11 +
 src/rdf4cpp/parser/ParsingFlags.hpp           |  30 +-
 src/rdf4cpp/parser/RDFFileParser.cpp          |  49 +-
 src/rdf4cpp/parser/RDFFileParser.hpp          |   2 +
 tests/CMakeLists.txt                          |  15 +
 tests/parser/tests_FormatGuess.cpp            | 457 ++++++++++++++++
 tests/parser/tests_FormatGuess_realworld.cpp  | 322 +++++++++++
 tests/parser/tests_IStreamQuadIterator.cpp    |   2 +-
 13 files changed, 1506 insertions(+), 19 deletions(-)
 create mode 100644 src/rdf4cpp/parser/FormatGuess.cpp
 create mode 100644 src/rdf4cpp/parser/FormatGuess.hpp
 create mode 100644 tests/parser/tests_FormatGuess.cpp
 create mode 100644 tests/parser/tests_FormatGuess_realworld.cpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f8cfb4a9..988aeddb1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.22)
-project(rdf4cpp VERSION 0.1.13)
+project(rdf4cpp VERSION 0.2.0)
 set(POBR_VERSION 3)  # Persisted Object Binary Representation
 
 include(cmake/boilerplate_init.cmake)
@@ -129,6 +129,7 @@ add_library(rdf4cpp
         src/rdf4cpp/datatypes/xsd/time/DayTimeDuration.cpp
         src/rdf4cpp/datatypes/xsd/time/YearMonthDuration.cpp
         src/rdf4cpp/namespaces/RDF.cpp
+        src/rdf4cpp/parser/FormatGuess.cpp
         src/rdf4cpp/parser/IStreamQuadIterator.cpp
         src/rdf4cpp/parser/RDFFileParser.cpp
         src/rdf4cpp/query/QuadPattern.cpp
diff --git a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp
index 21b4bd7ab..6543deda4 100644
--- a/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp
+++ b/private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp
@@ -57,6 +57,8 @@ struct IStreamQuadIterator::ImplSerd final : Impl {
 
     static constexpr SerdSyntax extract_syntax_from_flags(ParsingFlags flags) noexcept {
         switch (flags.get_syntax()) {
+            case ParsingFlag::Turtle:
+                return SerdSyntax::SERD_TURTLE;
             case ParsingFlag::TriG:
                 return SerdSyntax::SERD_TRIG;
             case ParsingFlag::NTriples:
@@ -64,7 +66,8 @@ struct IStreamQuadIterator::ImplSerd final : Impl {
             case ParsingFlag::NQuads:
                 return SerdSyntax::SERD_NQUADS;
             default:
-                return SerdSyntax::SERD_TURTLE;
+                // Auto, RdfXml, OwlXml, JsonLd should be resolved before reaching here
+                RDF4CPP_UNREACHABLE;
         }
     }
 
diff --git a/src/rdf4cpp/parser/FormatGuess.cpp b/src/rdf4cpp/parser/FormatGuess.cpp
new file mode 100644
index 000000000..c907d69e8
--- /dev/null
+++ b/src/rdf4cpp/parser/FormatGuess.cpp
@@ -0,0 +1,498 @@
+#include "FormatGuess.hpp"
+
+#include <algorithm>
+#include <cctype>
+#include <string>
+
+namespace rdf4cpp::parser {
+
+    // --- helpers ---
+
+    static std::string to_lower(std::string_view sv) {  // returns a copy of sv with every byte passed through std::tolower
+        std::string s{sv};
+        std::ranges::transform(s, s.begin(), [](unsigned char c) {  // unsigned char: std::tolower must not be given negative values
+            return std::tolower(c);
+        });
+        return s;
+    }
+
+    static std::string_view skip_whitespace_and_bom(std::string_view sv) {
+        // drop a leading UTF-8 byte order mark (EF BB BF), if present
+        if (sv.size() >= 3 && sv[0] == '\xEF' && sv[1] == '\xBB' && sv[2] == '\xBF') {
+            sv.remove_prefix(3);
+        }
+        // then drop leading spaces, tabs, line feeds and carriage returns
+        while (!sv.empty() && (sv.front() == ' ' || sv.front() == '\t' || sv.front() == '\n' || sv.front() == '\r')) {
+            sv.remove_prefix(1);
+        }
+        return sv;  // a shortened view of the same characters; nothing is copied
+    }
+
+    static bool starts_with_icase(std::string_view haystack, std::string_view needle) {
+        if (haystack.size() < needle.size()) {  // too short to start with needle
+            return false;
+        }
+        for (size_t i = 0; i < needle.size(); ++i) {  // compare the first needle.size() bytes, case-folded via std::tolower
+            if (std::tolower(static_cast<unsigned char>(haystack[i])) != std::tolower(static_cast<unsigned char>(needle[i]))) {
+                return false;
+            }
+        }
+        return true;  // also reached for an empty needle
+    }
+
+    static bool contains(std::string_view haystack, std::string_view needle) {
+        return haystack.find(needle) != std::string_view::npos;  // case-sensitive substring test
+    }
+
+    static bool contains_icase(std::string_view haystack, std::string_view needle) {
+        if (needle.size() > haystack.size()) {  // cannot match; also avoids the two copies below
+            return false;
+        }
+        auto lower_hay = to_lower(haystack);  // lowercased copies of both inputs ...
+        auto lower_needle = to_lower(needle);
+        return lower_hay.find(lower_needle) != std::string::npos;  // ... then a plain substring search
+    }
+
+    // --- extension mapping ---
+
+    FormatGuess guess_format_from_extension(std::string_view extension) noexcept {  // extension includes the dot, e.g. ".ttl"
+        // Compare in place, ignoring case: a lowercased std::string copy could throw std::bad_alloc in this noexcept function.
+        auto const is = [extension](std::string_view ext) { return extension.size() == ext.size() && starts_with_icase(extension, ext); };
+        if (is(".ttl") || is(".turtle")) {
+            return {ParsingFlag::Turtle, GuessConfidence::High};
+        }
+        if (is(".nt") || is(".ntriples")) {
+            return {ParsingFlag::NTriples, GuessConfidence::High};
+        }
+        if (is(".nq") || is(".nquads")) {
+            return {ParsingFlag::NQuads, GuessConfidence::High};
+        }
+        if (is(".trig")) {
+            return {ParsingFlag::TriG, GuessConfidence::High};
+        }
+        if (is(".rdf")) {
+            return {ParsingFlag::RdfXml, GuessConfidence::High};
+        }
+        if (is(".owx")) {
+            return {ParsingFlag::OwlXml, GuessConfidence::High};
+        }
+        if (is(".jsonld")) {
+            return {ParsingFlag::JsonLd, GuessConfidence::High};
+        }
+
+        // ambiguous extensions — need content sniffing, so only a Low-confidence RDF/XML guess
+        if (is(".owl") || is(".xml")) {
+            return {ParsingFlag::RdfXml, GuessConfidence::Low};
+        }
+
+        return {ParsingFlag::Auto, GuessConfidence::None};  // unrecognized extension
+    }
+
+    FormatGuess guess_format_from_path(std::string_view file_path) noexcept {
+        // keep only the last path component; both '/' and '\' count as separators
+        auto const last_sep = file_path.find_last_of("/\\");
+        auto const filename = (last_sep != std::string_view::npos) ? file_path.substr(last_sep + 1) : file_path;
+
+        // the extension starts at the last dot of that component (the dot is part of it)
+        auto const dot_pos = filename.rfind('.');
+        if (dot_pos == std::string_view::npos) {
+            return {ParsingFlag::Auto, GuessConfidence::None};  // no dot: nothing to go by
+        }
+
+        return guess_format_from_extension(filename.substr(dot_pos));  // e.g. "data.nt.gz" is judged by ".gz" only
+    }
+
+    // --- content sniffing ---
+
+    static bool has_trig_markers(std::string_view content) {
+        // TriG-only syntax is a `{` graph block or a standalone GRAPH keyword. Both only count outside of IRIs,
+        // string literals and comments, so that <http://example.org/graph/1> or "paragraph" do not turn Turtle into TriG.
+        bool const may_have_keyword = contains_icase(content, "GRAPH");  // cheap pre-filter for the keyword test below
+        bool in_string = false;
+        char string_delim = 0;
+        for (size_t i = 0; i < content.size(); ++i) {
+            char const c = content[i];
+            if (in_string) {
+                if (c == '\\' && i + 1 < content.size()) {
+                    ++i;  // skip escaped char
+                } else if (c == string_delim) {
+                    in_string = false;
+                }
+                continue;
+            }
+            if (c == '"' || c == '\'') {
+                in_string = true;
+                string_delim = c;
+            } else if (c == '<' || c == '#') {
+                i = content.find(c == '<' ? '>' : '\n', i);  // jump to the end of the IRI / comment line
+                if (i == std::string_view::npos) {
+                    break;  // unterminated IRI or comment: nothing left to inspect
+                }
+            } else if (c == '{') {
+                return true;
+            } else if (may_have_keyword && (i == 0 || std::isspace(static_cast<unsigned char>(content[i - 1])))
+                       && starts_with_icase(content.substr(i), "GRAPH") && i + 5 < content.size()
+                       && (content[i + 5] == '<' || content[i + 5] == '[' || std::isspace(static_cast<unsigned char>(content[i + 5])))) {
+                return true;  // whitespace-delimited GRAPH token followed by a graph name
+            }
+        }
+        return false;
+    }
+
+    static FormatGuess sniff_xml_content(std::string_view content) {
+        // Check for OWL/XML first — more specific markers.
+        // OWL/XML uses <Ontology> root element (not <rdf:RDF>) and may still
+        // declare xmlns:rdf as a namespace prefix, so checking OWL/XML before
+        // RDF/XML avoids false positives. All tests are case-sensitive substring searches over content.
+        bool has_ontology_root = contains(content, "<Ontology");
+        bool has_owl_ns = contains(content, "xmlns=\"http://www.w3.org/2002/07/owl#\"");  // default namespace, double-quoted form only
+        bool has_rdf_root = contains(content, "<rdf:RDF");
+        bool has_rdf_desc = contains(content, "<rdf:Description");
+
+        if (has_ontology_root && !has_rdf_root) {
+            return {ParsingFlag::OwlXml, GuessConfidence::High};
+        }
+        if (has_owl_ns && !has_rdf_root && !has_rdf_desc) {
+            return {ParsingFlag::OwlXml, GuessConfidence::High};
+        }
+        if (has_rdf_root || has_rdf_desc) {
+            return {ParsingFlag::RdfXml, GuessConfidence::High};
+        }
+        if (contains(content, "xmlns:rdf=")) {  // rdf prefix declared, but neither <rdf:RDF nor <rdf:Description found in content
+            return {ParsingFlag::RdfXml, GuessConfidence::Medium};
+        }
+        // Generic XML — could be RDF/XML, return low-confidence guess
+        return {ParsingFlag::RdfXml, GuessConfidence::Low};
+    }
+
+    static FormatGuess sniff_json_content(std::string_view content) {  // JSON-LD is the only JSON-based syntax reported
+        if (contains(content, "\"@context\"") || contains(content, "\"@id\"") || contains(content, "\"@graph\"")) {
+            return {ParsingFlag::JsonLd, GuessConfidence::High};
+        }
+        return {ParsingFlag::Auto, GuessConfidence::None};  // none of the quoted JSON-LD keywords occurs in content: no guess
+    }
+
+    static FormatGuess sniff_ntriples_or_nquads(std::string_view content) {
+        // Line-based N-Triples / N-Quads detection. Every non-empty, non-comment line has to be
+        // `term term term [term] .` where a term is an <iri>, a _:blank node or a "literal"
+        // (3 terms = NT, 4 terms = NQ); a single 4-term line makes the whole guess N-Quads.
+        // Any other line yields {Auto, None}. The one exception is a broken last line that is
+        // not terminated by a line break: `content` is normally a prefix of the document that
+        // was cut at an arbitrary byte, so such a fragment says nothing about the format.
+        bool found_4_terms = false;
+        bool found_any_triple = false;
+
+        size_t pos = 0;
+        while (pos < content.size()) {
+            // find end of line
+            auto eol = content.find('\n', pos);
+            auto line = content.substr(pos, eol == std::string_view::npos ? std::string_view::npos : eol - pos);
+            pos = (eol == std::string_view::npos) ? content.size() : eol + 1;
+
+            // trim
+            while (!line.empty() && (line.front() == ' ' || line.front() == '\t')) {
+                line.remove_prefix(1);
+            }
+            while (!line.empty() && (line.back() == ' ' || line.back() == '\t' || line.back() == '\r')) {
+                line.remove_suffix(1);
+            }
+
+            // skip empty lines and comments
+            if (line.empty() || line.front() == '#') {
+                continue;
+            }
+
+            // Count terms by walking through the line, handling # comments
+            // that appear outside of IRIs and literals correctly.
+            int term_count = 0;
+            bool found_dot = false;
+            bool malformed = false;  // set when the line cannot be an N-Triples/N-Quads statement
+            size_t i = 0;
+            while (i < line.size()) {
+                // skip whitespace
+                while (i < line.size() && (line[i] == ' ' || line[i] == '\t')) {
+                    ++i;
+                }
+                if (i >= line.size()) {
+                    break;
+                }
+                char c = line[i];
+
+                if (c == '.') {
+                    // statement terminator — the rest of the line (e.g. an inline comment) is not inspected
+                    found_dot = true;
+                    break;
+                } else if (c == '#') {
+                    // comment at top level (outside IRI/literal) before the terminating dot —
+                    // not valid N-Triples
+                    malformed = true;
+                    break;
+                } else if (c == '<') {
+                    // IRI — find closing >
+                    auto close = line.find('>', i);
+                    if (close == std::string_view::npos) {
+                        break;
+                    }
+                    i = close + 1;
+                    ++term_count;
+                } else if (c == '_' && i + 1 < line.size() && line[i + 1] == ':') {
+                    // blank node — skip to next whitespace
+                    while (i < line.size() && line[i] != ' ' && line[i] != '\t') {
+                        ++i;
+                    }
+                    ++term_count;
+                } else if (c == '"') {
+                    // literal — find unescaped closing quote, then skip datatype/lang
+                    ++i;
+                    while (i < line.size()) {
+                        if (line[i] == '\\') {
+                            i += 2;
+                            continue;
+                        }
+                        if (line[i] == '"') {
+                            break;
+                        }
+                        ++i;
+                    }
+                    if (i < line.size()) {
+                        ++i;  // skip closing quote
+                    }
+                    // skip ^^<datatype> or @lang (which may contain # inside <...>)
+                    if (i + 1 < line.size() && line[i] == '^' && line[i + 1] == '^') {
+                        i += 2;
+                        if (i < line.size() && line[i] == '<') {
+                            auto close = line.find('>', i);
+                            if (close != std::string_view::npos) {
+                                i = close + 1;
+                            }
+                        } else {
+                            while (i < line.size() && line[i] != ' ' && line[i] != '\t' && line[i] != '.') {
+                                ++i;
+                            }
+                        }
+                    } else if (i < line.size() && line[i] == '@') {
+                        while (i < line.size() && line[i] != ' ' && line[i] != '\t' && line[i] != '.') {
+                            ++i;
+                        }
+                    }
+                    ++term_count;
+                } else {
+                    // unexpected character for N-Triples/N-Quads
+                    malformed = true;
+                    break;
+                }
+            }
+
+            if (malformed || !found_dot || term_count < 3 || term_count > 4) {
+                // A broken final line without a line break is most likely the cut-off tail of the
+                // sniffed prefix; ignore it once earlier complete statements support a guess.
+                if (eol == std::string_view::npos && found_any_triple) {
+                    break;
+                }
+                return {ParsingFlag::Auto, GuessConfidence::None};
+            }
+            found_any_triple = true;
+            if (term_count == 4) {
+                found_4_terms = true;
+            }
+        }
+
+        if (!found_any_triple) {
+            return {ParsingFlag::Auto, GuessConfidence::None};
+        }
+        if (found_4_terms) {
+            return {ParsingFlag::NQuads, GuessConfidence::Medium};
+        }
+        return {ParsingFlag::NTriples, GuessConfidence::Medium};
+    }
+
+    static std::string_view skip_comments(std::string_view sv) {
+        // drop the run of '#' comment lines at the very start of sv (comment syntax of N-Triples/Turtle/TriG)
+        while (!sv.empty() && sv.front() == '#') {
+            auto eol = sv.find('\n');
+            if (eol == std::string_view::npos) {
+                return {};  // the comment runs to the end of sv: nothing but comments
+            }
+            sv.remove_prefix(eol + 1);
+            // skip whitespace (including blank lines) before the next line
+            while (!sv.empty() && (sv.front() == ' ' || sv.front() == '\t' || sv.front() == '\n' || sv.front() == '\r')) {
+                sv.remove_prefix(1);
+            }
+        }
+        return sv;
+    }
+
+    FormatGuess guess_format_from_content(std::string_view prefix) noexcept {
+        auto full_content = skip_whitespace_and_bom(prefix);
+        if (full_content.empty()) {
+            return {ParsingFlag::Auto, GuessConfidence::None};
+        }
+
+        // Skip leading comment lines for the first-byte checks (full_content keeps them for the whole-content scans)
+        auto content = skip_comments(full_content);
+        if (content.empty()) {
+            return {ParsingFlag::Auto, GuessConfidence::None};
+        }
+
+        // Phase 1: deterministic checks
+
+        // XML-based formats (NOTE(review): the "<rdf:RDF" test is already covered by the "<rdf:" test)
+        if (content.starts_with("<?xml") || content.starts_with("<rdf:RDF") || content.starts_with("<rdf:")) {
+            return sniff_xml_content(content);
+        }
+
+        // JSON-based formats — but `{` can also be a TriG default graph block,
+        // and `[` can be a TriG blank node graph name or a Turtle blank node property list.
+        if (content.front() == '[') {
+            auto after_bracket = content.substr(1);
+            while (!after_bracket.empty()
+                   && (after_bracket.front() == ' ' || after_bracket.front() == '\t' || after_bracket.front() == '\n'
+                       || after_bracket.front() == '\r'))
+            {
+                after_bracket.remove_prefix(1);
+            }
+            // JSON arrays start with `[` followed by `{`, `"`, `[`, number, true, false, null
+            // Turtle/TriG blank nodes: `[]` or `[ predicate object ]`
+            if (!after_bracket.empty() && (after_bracket.front() == ']' || after_bracket.front() == '<' || after_bracket.front() == '_')) {
+                // Likely Turtle/TriG blank node
+                if (has_trig_markers(full_content)) {
+                    return {ParsingFlag::TriG, GuessConfidence::Medium};
+                }
+                return {ParsingFlag::Turtle, GuessConfidence::Low};
+            }
+            return sniff_json_content(content);  // NOTE(review): also reached for `[ a ex:T ]` / `[ ex:p ... ]`, which then yields {Auto, None}
+        }
+        if (content.front() == '{') {
+            auto after_brace = content.substr(1);
+            while (!after_brace.empty()
+                   && (after_brace.front() == ' ' || after_brace.front() == '\t' || after_brace.front() == '\n'
+                       || after_brace.front() == '\r'))
+            {
+                after_brace.remove_prefix(1);
+            }
+            if (after_brace.empty() || after_brace.front() == '"') {  // a lone `{` or `{ "` is treated as a JSON object
+                return sniff_json_content(content);
+            }
+            // Likely TriG — `{` followed by non-JSON content
+            return {ParsingFlag::TriG, GuessConfidence::Medium};
+        }
+
+        // Turtle directives (case-sensitive @prefix/@base)
+        if (content.starts_with("@prefix") || content.starts_with("@base")) {
+            if (has_trig_markers(full_content)) {
+                return {ParsingFlag::TriG, GuessConfidence::Medium};
+            }
+            return {ParsingFlag::Turtle, GuessConfidence::High};
+        }
+
+        // SPARQL-style PREFIX/BASE (case-insensitive)
+        if (starts_with_icase(content, "PREFIX") || starts_with_icase(content, "BASE")) {
+            if (has_trig_markers(full_content)) {
+                return {ParsingFlag::TriG, GuessConfidence::Medium};
+            }
+            return {ParsingFlag::Turtle, GuessConfidence::Medium};
+        }
+
+        // Phase 2: try N-Triples / N-Quads line-based detection
+        if (content.front() == '<' || (content.front() == '_' && content.size() > 1 && content[1] == ':')) {
+            auto result = sniff_ntriples_or_nquads(full_content);
+            if (result.is_known()) {
+                return result;
+            }
+            // If N-Triples detection failed, the content starts with `<` or `_:`
+            // which are valid Turtle starts too — fall through to Phase 3
+        }
+
+        // Phase 3: check for Turtle/TriG syntax markers in content that didn't
+        // match any earlier patterns (e.g. Turtle without @prefix directives)
+        {
+            bool has_turtle_marker = false;
+            bool in_iri = false;
+            bool in_string = false;
+            char string_delim = 0;
+
+            for (size_t i = 0; i < content.size(); ++i) {
+                char c = content[i];
+                if (in_string) {
+                    if (c == '\\' && i + 1 < content.size()) {
+                        ++i;
+                        continue;
+                    }
+                    if (c == string_delim) {
+                        in_string = false;
+                    }
+                    continue;
+                }
+                if (in_iri) {
+                    if (c == '>') {
+                        in_iri = false;
+                    }
+                    continue;
+                }
+                if (c == '<') {
+                    in_iri = true;
+                    continue;
+                }
+                if (c == '"' || c == '\'') {
+                    in_string = true;
+                    string_delim = c;
+                    continue;
+                }
+                // Turtle/TriG syntax markers not valid in N-Triples (searched outside of IRIs and string literals only)
+                if (c == ';' || c == ',' || c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}') {
+                    has_turtle_marker = true;
+                    break;
+                }
+                // bare `a` surrounded by whitespace = Turtle shorthand for rdf:type
+                if (c == 'a' && i > 0 && (content[i - 1] == ' ' || content[i - 1] == '\t') && i + 1 < content.size()
+                    && (content[i + 1] == ' ' || content[i + 1] == '\t'))
+                {
+                    has_turtle_marker = true;
+                    break;
+                }
+            }
+
+            if (has_turtle_marker) {
+                if (has_trig_markers(full_content)) {
+                    return {ParsingFlag::TriG, GuessConfidence::Low};
+                }
+                return {ParsingFlag::Turtle, GuessConfidence::Low};
+            }
+        }
+
+        return {ParsingFlag::Auto, GuessConfidence::None};  // nothing matched: the caller decides on a fallback
+    }
+
+    FormatGuess guess_format(std::string_view file_path, std::string_view prefix) noexcept {
+        auto ext_guess = guess_format_from_path(file_path);
+        auto content_guess = guess_format_from_content(prefix);  // computed unconditionally, also when the extension already decides
+
+        // Unambiguous extension: it decides the syntax, content can only raise the confidence
+        if (ext_guess.confidence == GuessConfidence::High) {
+            // Check if content agrees for Certain confidence
+            if (content_guess.is_known() && content_guess.syntax == ext_guess.syntax) {
+                return {ext_guess.syntax, GuessConfidence::Certain};
+            }
+            // Extension is high confidence — trust it even if content is inconclusive or points to another syntax
+            return ext_guess;
+        }
+
+        // Low confidence extension (e.g. .owl, .xml) — need content disambiguation
+        if (ext_guess.confidence == GuessConfidence::Low) {
+            if (content_guess.is_known()) {
+                // Content overrides ambiguous extension
+                return content_guess;
+            }
+            // Content inconclusive, use extension guess
+            return ext_guess;
+        }
+
+        // No extension match — rely on content
+        if (content_guess.is_known()) {
+            return content_guess;
+        }
+
+        return {ParsingFlag::Auto, GuessConfidence::None};  // neither extension nor content gave a usable guess
+    }
+
+}  // namespace rdf4cpp::parser
diff --git a/src/rdf4cpp/parser/FormatGuess.hpp b/src/rdf4cpp/parser/FormatGuess.hpp
new file mode 100644
index 000000000..268734e7d
--- /dev/null
+++ b/src/rdf4cpp/parser/FormatGuess.hpp
@@ -0,0 +1,64 @@
+#ifndef RDF4CPP_PARSER_FORMATGUESS_HPP
+#define RDF4CPP_PARSER_FORMATGUESS_HPP
+
+#include <cstdint>
+#include <string_view>
+
+#include <rdf4cpp/parser/ParsingFlags.hpp>
+
+namespace rdf4cpp::parser {
+
+    /**
+     * Confidence level for an RDF format guess, ordered from least to most confident.
+     */
+    enum struct GuessConfidence : uint8_t {
+        None = 0,  ///< no guess could be made
+        Low,       ///< weak heuristic match (ambiguous extension like .owl, or only generic Turtle/TriG punctuation in the content)
+        Medium,    ///< content sniffing with good signal
+        High,      ///< file extension match or strong content match
+        Certain,   ///< unambiguous (extension + content agree, or the syntax was given explicitly)
+    };
+
+    /**
+     * Result of an RDF serialization format guess, combining the detected syntax with
+     * a confidence level indicating how reliable the guess is.
+     */
+    struct FormatGuess {
+        ParsingFlag syntax = ParsingFlag::Auto;  // Auto stands for "no syntax determined"
+        GuessConfidence confidence = GuessConfidence::None;
+
+        [[nodiscard]] constexpr bool is_known() const noexcept {  // true only for a concrete syntax with a confidence above None
+            return syntax != ParsingFlag::Auto && confidence != GuessConfidence::None;
+        }
+
+        [[nodiscard]] constexpr explicit operator bool() const noexcept {  // same as is_known()
+            return is_known();
+        }
+    };
+
+    /**
+     * Guess the RDF serialization format from a file extension (including the dot), ignoring case.
+     * Returns {Auto, None} for unrecognized extensions and a Low-confidence RDF/XML guess for .owl/.xml.
+     */
+    [[nodiscard]] FormatGuess guess_format_from_extension(std::string_view extension) noexcept;
+
+    /**
+     * Guess from the extension of the last path component ('/' or '\' separated); {Auto, None} if it has no dot.
+     */
+    [[nodiscard]] FormatGuess guess_format_from_path(std::string_view file_path) noexcept;
+
+    /**
+     * Guess the RDF serialization format by inspecting a prefix of the file content.
+     * At least 512 bytes recommended, 4096 bytes ideal. Returns {Auto, None} if nothing matches.
+     */
+    [[nodiscard]] FormatGuess guess_format_from_content(std::string_view prefix) noexcept;
+
+    /**
+     * Combined guess: a High-confidence extension decides (Certain if the content guess agrees);
+     * otherwise a known content guess wins, and a Low-confidence extension guess is the last resort.
+     */
+    [[nodiscard]] FormatGuess guess_format(std::string_view file_path, std::string_view prefix) noexcept;
+
+}  // namespace rdf4cpp::parser
+
+#endif  // RDF4CPP_PARSER_FORMATGUESS_HPP
diff --git a/src/rdf4cpp/parser/IStreamQuadIterator.cpp b/src/rdf4cpp/parser/IStreamQuadIterator.cpp
index 1abc99c91..087e58f56 100644
--- a/src/rdf4cpp/parser/IStreamQuadIterator.cpp
+++ b/src/rdf4cpp/parser/IStreamQuadIterator.cpp
@@ -1,9 +1,12 @@
 #include "IStreamQuadIterator.hpp"
 
 #include <rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp>
+#include <rdf4cpp/parser/PrefixBufferedReader.hpp>
 #include <rdf4cpp/parser/XMLParser.hpp>
 
 #include <cstdio>
+#include <stdexcept>
+#include <vector>
 
 #if __has_include(<fcntl.h>)
 #include <fcntl.h>
@@ -11,6 +14,8 @@
 
 namespace rdf4cpp::parser {
 
+static constexpr size_t peek_size = 4096;  // number of bytes read ahead for content sniffing when the syntax is Auto
+
 /**
  * Adaptor function so that serd can read from std::istreams.
  * Matches the interface of SerdSource/fread
@@ -51,16 +56,64 @@ static int istream_eof(void *voided_self) noexcept {
     return static_cast<int>(self->eof());
 }
 
+static void throw_if_unsupported(ParsingFlag syntax) {  // throws std::runtime_error for OwlXml and JsonLd; no-op for any other syntax
+    if (syntax == ParsingFlag::OwlXml) {
+        throw std::runtime_error("OWL/XML format is not supported. Please convert to RDF/XML or Turtle.");
+    }
+    if (syntax == ParsingFlag::JsonLd) {
+        throw std::runtime_error("JSON-LD format is not supported.");
+    }
+}
+
+static ParsingFlag resolve_auto_syntax(FormatGuess guess) {  // maps a sniffing result to the syntax that is actually parsed
+    if (guess.is_known()) {
+        return guess.syntax;
+    }
+    // fallback to Turtle when we can't determine the format (never returns Auto)
+    return ParsingFlag::Turtle;
+}
+
 IStreamQuadIterator::IStreamQuadIterator(void *stream,
                                          ReadFunc read,
                                          ErrorFunc error,
                                          EOFFunc eof,
                                          flags_type flags,
-                                         state_type *state)
-    : impl{flags.get_syntax() == ParsingFlag::RdfXml ?
-        static_cast<std::unique_ptr<Impl>>(std::make_unique<ImplXML>(stream, read, error, eof, state)) :
-        std::make_unique<ImplSerd>(stream, read, error, flags, state)},
-      cur{impl->next()} {
+                                         state_type *state) {
+    auto make_impl = [](void *s, ReadFunc r, ErrorFunc e, EOFFunc ef,
+                        flags_type f, state_type *st) -> std::unique_ptr<Impl> {
+        if (f.get_syntax() == ParsingFlag::RdfXml) {  // RdfXml → ImplXML, every other syntax → ImplSerd
+            return std::make_unique<ImplXML>(s, r, e, ef, st);
+        }
+        return std::make_unique<ImplSerd>(s, r, e, f, st);
+    };
+
+    if (flags.get_syntax() == ParsingFlag::Auto) {
+        // Peek up to peek_size bytes for content sniffing (a single read call; its result is not checked via error())
+        std::vector<char> buf(peek_size);
+        size_t const bytes_read = read(buf.data(), 1, peek_size, stream);
+        buf.resize(bytes_read);
+
+        std::string_view const prefix{buf.data(), buf.size()};
+        auto const guess = guess_format_from_content(prefix);
+        auto const resolved = resolve_auto_syntax(guess);
+        throw_if_unsupported(resolved);
+
+        detected_format_ = guess;  // the raw guess, not the resolved syntax: stays unknown when the Turtle fallback is used
+        auto const resolved_flags = flags.with_syntax(resolved);
+
+        // Create a PrefixBufferedReader to replay peeked bytes; the parser reads through it instead of the raw stream
+        buffered_reader_ = std::make_unique<PrefixBufferedReader>(stream, read, error, eof, std::move(buf));
+        impl = make_impl(buffered_reader_.get(),
+                         &PrefixBufferedReader::read_func,
+                         &PrefixBufferedReader::error_func,
+                         &PrefixBufferedReader::eof_func,
+                         resolved_flags, state);
+    } else {
+        throw_if_unsupported(flags.get_syntax());
+        detected_format_ = FormatGuess{flags.get_syntax(), GuessConfidence::Certain};  // explicit syntax: reported as Certain
+        impl = make_impl(stream, read, error, eof, flags, state);
+    }
+    cur = impl->next();  // fetch the first result right away
+}
 
 IStreamQuadIterator::IStreamQuadIterator(std::istream &istream,
@@ -95,6 +148,10 @@ uint64_t IStreamQuadIterator::current_column() const noexcept {
     return impl->current_column();
 }
 
+FormatGuess IStreamQuadIterator::detected_format() const noexcept {
+    return detected_format_;  // set once by the constructor
+}
+
 bool IStreamQuadIterator::operator==(std::default_sentinel_t) const noexcept {
     return !cur.has_value();
 }
diff --git a/src/rdf4cpp/parser/IStreamQuadIterator.hpp b/src/rdf4cpp/parser/IStreamQuadIterator.hpp
index 47148b070..0ed2a0e3b 100644
--- a/src/rdf4cpp/parser/IStreamQuadIterator.hpp
+++ b/src/rdf4cpp/parser/IStreamQuadIterator.hpp
@@ -8,6 +8,7 @@
 
 #include <rdf4cpp/Quad.hpp>
 
+#include <rdf4cpp/parser/FormatGuess.hpp>
 #include <rdf4cpp/parser/ParsingError.hpp>
 #include <rdf4cpp/parser/ParsingFlags.hpp>
 #include <rdf4cpp/parser/ParsingState.hpp>
@@ -15,6 +16,8 @@
 
 namespace rdf4cpp::parser {
 
+struct PrefixBufferedReader;
+
 /**
  * Identical semantics to fread.
  * Uses stream to read at most count elements of size element_size into buffer.
@@ -107,6 +110,9 @@ struct IStreamQuadIterator {
 
     std::unique_ptr<Impl> impl;
     std::optional<nonstd::expected<ok_type, error_type>> cur;
+    FormatGuess detected_format_{};
+    std::unique_ptr<PrefixBufferedReader> buffered_reader_;
+
 public:
     /**
      * Constructs a IStreamQuadIterator from a C-like io api. That is something similar to
@@ -155,6 +161,11 @@ struct IStreamQuadIterator {
     [[nodiscard]] uint64_t current_line() const noexcept;
     [[nodiscard]] uint64_t current_column() const noexcept;
 
+    /**
+     * @return the content-sniffing guess when Auto mode was used (unknown if the Turtle fallback applied), or the explicitly set format with confidence Certain
+     */
+    [[nodiscard]] FormatGuess detected_format() const noexcept;
+
     bool operator==(std::default_sentinel_t) const noexcept;
     bool operator!=(std::default_sentinel_t) const noexcept;
 };
diff --git a/src/rdf4cpp/parser/ParsingFlags.hpp b/src/rdf4cpp/parser/ParsingFlags.hpp
index 6da05a64e..02eb86a99 100644
--- a/src/rdf4cpp/parser/ParsingFlags.hpp
+++ b/src/rdf4cpp/parser/ParsingFlags.hpp
@@ -8,8 +8,8 @@ namespace rdf4cpp::parser {
 
 /**
  * Note that the syntax flags are mutually exclusive.
- * If none is used, Turtle is the default.
- * If more than one is used accidentally at the same time, TriG is likely the result (even if it does never get specified).
+ * If none is used, Auto is the default (auto-detect format from file extension and content).
+ * If more than one is used accidentally at the same time, the result is undefined.
  */
 enum struct ParsingFlag : uint8_t {
     Lax              = 1 << 0,
@@ -17,11 +17,14 @@ enum struct ParsingFlag : uint8_t {
     KeepBlankNodeIds = 1 << 2,
     NoParseBlankNode = 1 << 3,
 
-    Turtle   = 0b00 << 4, // default
-    NTriples = 0b01 << 4,
-    NQuads   = 0b10 << 4,
-    TriG     = 0b11 << 4,
-    RdfXml  = 0b100 << 4,
+    Auto     = 0b000 << 4, // default — auto-detect format
+    NTriples = 0b001 << 4,
+    NQuads   = 0b010 << 4,
+    TriG     = 0b011 << 4,
+    RdfXml   = 0b100 << 4,
+    Turtle   = 0b101 << 4,
+    OwlXml   = 0b110 << 4, // detected but not supported
+    JsonLd   = 0b111 << 4, // detected but not supported
 };
 constexpr uint8_t ParsingFlag_SyntaxMask = 0b111 << 4;
 
@@ -66,15 +69,24 @@ struct ParsingFlags {
     }
 
     /**
-     * @return the syntax ParsingFlag contained in this ParsingFlags. (Turtle if not specified)
+     * @return the syntax ParsingFlag contained in this ParsingFlags. (Auto if not specified)
      */
     [[nodiscard]] constexpr ParsingFlag get_syntax() const noexcept {
         return static_cast<ParsingFlag>(flags & static_cast<flag_u_type>(ParsingFlag_SyntaxMask));
     }
 
+    /**
+     * @param syntax the syntax flag to select; any bits outside ParsingFlag_SyntaxMask are ignored
+     * @return a copy of this ParsingFlags with the syntax bits replaced by the given syntax
+     */
+    [[nodiscard]] constexpr ParsingFlags with_syntax(ParsingFlag syntax) const noexcept {
+        constexpr auto syntax_mask = static_cast<flag_u_type>(ParsingFlag_SyntaxMask);
+        return ParsingFlags{static_cast<uint8_t>((flags & ~syntax_mask) | (static_cast<flag_u_type>(syntax) & syntax_mask))};
+    }
+
     [[nodiscard]] constexpr bool syntax_allows_prefixes() const noexcept {
         auto const syn = get_syntax();
-        return syn == ParsingFlag::Turtle || syn ==  ParsingFlag::TriG;
+        return syn == ParsingFlag::Turtle || syn == ParsingFlag::TriG || syn == ParsingFlag::Auto;
     }
 };
 
diff --git a/src/rdf4cpp/parser/RDFFileParser.cpp b/src/rdf4cpp/parser/RDFFileParser.cpp
index 055a2db0e..3f5d70963 100644
--- a/src/rdf4cpp/parser/RDFFileParser.cpp
+++ b/src/rdf4cpp/parser/RDFFileParser.cpp
@@ -1,23 +1,62 @@
 #include "RDFFileParser.hpp"
 
-
+#include <rdf4cpp/parser/FormatGuess.hpp>
 #include <rdf4cpp/parser/IStreamQuadIteratorSerdImpl.hpp>
 
+#include <stdexcept>
+#include <vector>
+
 namespace rdf4cpp::parser {
+
+static constexpr size_t peek_size = 4096;
+
 RDFFileParser::RDFFileParser(const std::string &file_path, flags_type flags, state_type *state)
     : file_path_(file_path), flags_(flags), state_(state) {
 }
 RDFFileParser::RDFFileParser(std::string &&file_path, flags_type flags, state_type *state)
     : file_path_(std::move(file_path)), flags_(flags), state_(state) {
 }
+
 RDFFileParser::iterator RDFFileParser::begin() const {
     FILE *stream = fopen_fastseq(file_path_.c_str(), "r");
     if (stream == nullptr) {
         throw std::system_error{errno, std::system_category()};
     }
 
-    return {std::move(stream), flags_, state_};
+    auto flags = flags_;
+
+    if (flags.get_syntax() == ParsingFlag::Auto) {
+        // Resolve Auto to a concrete syntax before the stream is handed to the iterator.
+        // Whatever is thrown in here (by the checks below, the vector or guess_format) must not
+        // leak the FILE handle, so one catch-all closes it and rethrows.
+        try {
+            std::vector<char> buf(peek_size);
+            buf.resize(fread(buf.data(), 1, buf.size(), stream));
+
+            // Rewind the stream so IStreamQuadIterator reads from start
+            if (fseek(stream, 0, SEEK_SET) != 0) {
+                throw std::runtime_error("Failed to rewind file stream for format detection");
+            }
+
+            auto guess = guess_format(file_path_, std::string_view{buf.data(), buf.size()});
+            // An inconclusive guess falls back to Turtle.
+            auto const resolved = guess.is_known() ? guess.syntax : ParsingFlag::Turtle;
+            if (resolved == ParsingFlag::OwlXml) {
+                throw std::runtime_error("OWL/XML format is not supported. Please convert to RDF/XML or Turtle.");
+            }
+            if (resolved == ParsingFlag::JsonLd) {
+                throw std::runtime_error("JSON-LD format is not supported.");
+            }
+            flags = flags.with_syntax(resolved);
+        } catch (...) {
+            fclose(stream);
+            throw;
+        }
+    }
+
+    return {std::move(stream), flags, state_};
 }
+
 std::default_sentinel_t RDFFileParser::end() const noexcept {
     return {};
 }
@@ -48,6 +87,12 @@ RDFFileParser::iterator &RDFFileParser::iterator::operator++() {
 bool RDFFileParser::iterator::operator==(const RDFFileParser::iterator &other) const noexcept {
     return iter_ == other.iter_;
 }
+// Format reported by the wrapped IStreamQuadIterator.
+// If this iterator wraps nothing, a value-initialized FormatGuess is returned instead.
+FormatGuess RDFFileParser::iterator::detected_format() const noexcept {
+    bool const has_inner = static_cast<bool>(iter_);
+    return has_inner ? iter_->detected_format() : FormatGuess{};
+}
 bool operator==(const RDFFileParser::iterator &iter, std::default_sentinel_t s) noexcept {
     return (*iter.iter_) == s;
 }
diff --git a/src/rdf4cpp/parser/RDFFileParser.hpp b/src/rdf4cpp/parser/RDFFileParser.hpp
index a45ef47b9..da1128161 100644
--- a/src/rdf4cpp/parser/RDFFileParser.hpp
+++ b/src/rdf4cpp/parser/RDFFileParser.hpp
@@ -2,6 +2,7 @@
 #define RDF4CPP_RDFFILEPARSER_HPP
 
 #include <cstdio>
+#include <rdf4cpp/parser/FormatGuess.hpp>
 #include <rdf4cpp/parser/IStreamQuadIterator.hpp>
 
 namespace rdf4cpp::parser {
@@ -70,6 +71,7 @@ struct RDFFileParser {
         pointer operator->() const noexcept;
         iterator &operator++();
         [[nodiscard]] bool operator==(const iterator &other) const noexcept;
+        [[nodiscard]] FormatGuess detected_format() const noexcept;
         // != gets generated by compiler
     };
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 2c38a3cd5..4a72658e6 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -369,6 +369,21 @@ target_link_libraries(tests_XMLParser
 )
 add_test(NAME tests_XMLParser COMMAND tests_XMLParser)
 
+add_executable(tests_FormatGuess parser/tests_FormatGuess.cpp)
+target_link_libraries(tests_FormatGuess
+        doctest::doctest
+        rdf4cpp
+)
+add_test(NAME tests_FormatGuess COMMAND tests_FormatGuess)
+
+add_executable(tests_FormatGuess_realworld parser/tests_FormatGuess_realworld.cpp)
+target_link_libraries(tests_FormatGuess_realworld
+        doctest::doctest
+        rdf4cpp
+        CURL::libcurl
+)
+add_test(NAME tests_FormatGuess_realworld COMMAND tests_FormatGuess_realworld)
+
 if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/test_swdf/swdf.nt")
         file(DOWNLOAD "https://files.tentris.dev/swdf.zip" "${CMAKE_CURRENT_BINARY_DIR}/test_swdf/swdf.zip")
         execute_process(COMMAND unzip "${CMAKE_CURRENT_BINARY_DIR}/test_swdf/swdf.zip" -d "${CMAKE_CURRENT_BINARY_DIR}/test_swdf")
diff --git a/tests/parser/tests_FormatGuess.cpp b/tests/parser/tests_FormatGuess.cpp
new file mode 100644
index 000000000..135f02594
--- /dev/null
+++ b/tests/parser/tests_FormatGuess.cpp
@@ -0,0 +1,457 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+
+#include <doctest/doctest.h>
+
+#include <rdf4cpp.hpp>
+#include <rdf4cpp/parser/FormatGuess.hpp>
+#include <rdf4cpp/parser/RDFFileParser.hpp>
+
+#include <sstream>
+
+using namespace rdf4cpp;
+using namespace rdf4cpp::parser;
+
+TEST_SUITE("FormatGuess") {
+
+    TEST_CASE("guess_format_from_extension") {
+        SUBCASE("known extensions") {
+            CHECK(guess_format_from_extension(".ttl").syntax == ParsingFlag::Turtle);
+            CHECK(guess_format_from_extension(".ttl").confidence == GuessConfidence::High);
+            CHECK(guess_format_from_extension(".turtle").syntax == ParsingFlag::Turtle);
+
+            CHECK(guess_format_from_extension(".nt").syntax == ParsingFlag::NTriples);
+            CHECK(guess_format_from_extension(".nt").confidence == GuessConfidence::High);
+            CHECK(guess_format_from_extension(".ntriples").syntax == ParsingFlag::NTriples);
+
+            CHECK(guess_format_from_extension(".nq").syntax == ParsingFlag::NQuads);
+            CHECK(guess_format_from_extension(".nq").confidence == GuessConfidence::High);
+            CHECK(guess_format_from_extension(".nquads").syntax == ParsingFlag::NQuads);
+
+            CHECK(guess_format_from_extension(".trig").syntax == ParsingFlag::TriG);
+            CHECK(guess_format_from_extension(".trig").confidence == GuessConfidence::High);
+
+            CHECK(guess_format_from_extension(".rdf").syntax == ParsingFlag::RdfXml);
+            CHECK(guess_format_from_extension(".rdf").confidence == GuessConfidence::High);
+
+            CHECK(guess_format_from_extension(".owx").syntax == ParsingFlag::OwlXml);
+            CHECK(guess_format_from_extension(".owx").confidence == GuessConfidence::High);
+
+            CHECK(guess_format_from_extension(".jsonld").syntax == ParsingFlag::JsonLd);
+            CHECK(guess_format_from_extension(".jsonld").confidence == GuessConfidence::High);
+        }
+
+        SUBCASE("ambiguous extensions") {
+            CHECK(guess_format_from_extension(".owl").confidence == GuessConfidence::Low);
+            CHECK(guess_format_from_extension(".xml").confidence == GuessConfidence::Low);
+        }
+
+        SUBCASE("unknown extensions") {
+            CHECK_FALSE(guess_format_from_extension(".gz").is_known());
+            CHECK_FALSE(guess_format_from_extension(".csv").is_known());
+            CHECK_FALSE(guess_format_from_extension(".txt").is_known());
+            CHECK_FALSE(guess_format_from_extension("").is_known());
+        }
+
+        SUBCASE("case insensitive") {
+            CHECK(guess_format_from_extension(".TTL").syntax == ParsingFlag::Turtle);
+            CHECK(guess_format_from_extension(".Nt").syntax == ParsingFlag::NTriples);
+            CHECK(guess_format_from_extension(".NQ").syntax == ParsingFlag::NQuads);
+            CHECK(guess_format_from_extension(".TRIG").syntax == ParsingFlag::TriG);
+            CHECK(guess_format_from_extension(".RDF").syntax == ParsingFlag::RdfXml);
+            CHECK(guess_format_from_extension(".JSONLD").syntax == ParsingFlag::JsonLd);
+        }
+    }
+
+    TEST_CASE("guess_format_from_path") {
+        CHECK(guess_format_from_path("/path/to/file.ttl").syntax == ParsingFlag::Turtle);
+        CHECK(guess_format_from_path("/some/dir/data.nt").syntax == ParsingFlag::NTriples);
+        CHECK(guess_format_from_path("file.nq").syntax == ParsingFlag::NQuads);
+        CHECK(guess_format_from_path("/a/b/c.trig").syntax == ParsingFlag::TriG);
+        CHECK(guess_format_from_path("ontology.rdf").syntax == ParsingFlag::RdfXml);
+        CHECK(guess_format_from_path("data.jsonld").syntax == ParsingFlag::JsonLd);
+
+        SUBCASE("no extension") {
+            CHECK_FALSE(guess_format_from_path("/path/to/file").is_known());
+            CHECK_FALSE(guess_format_from_path("").is_known());
+        }
+
+        SUBCASE("path separators") {
+            CHECK(guess_format_from_path("C:\\Users\\data.ttl").syntax == ParsingFlag::Turtle);
+            CHECK(guess_format_from_path("/home/user/data.ttl").syntax == ParsingFlag::Turtle);
+        }
+    }
+
+    TEST_CASE("guess_format_from_content") {
+
+        SUBCASE("N-Triples") {
+            constexpr char const *nt_content =
+                "<http://example/s> <http://example/p> \"object\" .\n"
+                "<http://example/s> <http://example/p> <http://example/o> .\n";
+            auto guess = guess_format_from_content(nt_content);
+            CHECK(guess.syntax == ParsingFlag::NTriples);
+            CHECK(guess.is_known());
+        }
+
+        SUBCASE("N-Quads") {
+            constexpr char const *nq_content =
+                "<http://example/s> <http://example/p> <http://example/o> <http://example/g> .\n"
+                "<http://example/s2> <http://example/p2> <http://example/o2> .\n";
+            auto guess = guess_format_from_content(nq_content);
+            CHECK(guess.syntax == ParsingFlag::NQuads);
+            CHECK(guess.is_known());
+        }
+
+        SUBCASE("Turtle with @prefix") {
+            constexpr char const *ttl_content =
+                "@prefix ex: <http://example.org/> .\n"
+                "ex:s ex:p ex:o .\n";
+            auto guess = guess_format_from_content(ttl_content);
+            CHECK(guess.syntax == ParsingFlag::Turtle);
+            CHECK(guess.is_known());
+        }
+
+        SUBCASE("Turtle with @base") {
+            constexpr char const *ttl_content =
+                "@base <http://example.org/> .\n"
+                "<s> <p> <o> .\n";
+            auto guess = guess_format_from_content(ttl_content);
+            CHECK(guess.syntax == ParsingFlag::Turtle);
+            CHECK(guess.is_known());
+        }
+
+        SUBCASE("Turtle with SPARQL-style PREFIX") {
+            constexpr char const *ttl_content =
+                "PREFIX ex: <http://example.org/>\n"
+                "ex:s ex:p ex:o .\n";
+            auto guess = guess_format_from_content(ttl_content);
+            CHECK(guess.syntax == ParsingFlag::Turtle);
+            CHECK(guess.is_known());
+        }
+
+        SUBCASE("TriG with GRAPH keyword") {
+            constexpr char const *trig_content =
+                "@prefix ex: <http://example.org/> .\n"
+                "GRAPH ex:g { ex:s ex:p ex:o . }\n";
+            auto guess = guess_format_from_content(trig_content);
+            CHECK(guess.syntax == ParsingFlag::TriG);
+            CHECK(guess.is_known());
+        }
+
+        SUBCASE("TriG with curly braces") {
+            constexpr char const *trig_content =
+                "@prefix ex: <http://example.org/> .\n"
+                "ex:g { ex:s ex:p ex:o . }\n";
+            auto guess = guess_format_from_content(trig_content);
+            CHECK(guess.syntax == ParsingFlag::TriG);
+            CHECK(guess.is_known());
+        }
+
+        SUBCASE("RDF/XML with xml declaration") {
+            constexpr char const *rdfxml_content =
+                "<?xml version=\"1.0\"?>\n"
+                "<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\">\n"
+                "</rdf:RDF>";
+            auto guess = guess_format_from_content(rdfxml_content);
+            CHECK(guess.syntax == ParsingFlag::RdfXml);
+            CHECK(guess.is_known());
+        }
+
+        SUBCASE("RDF/XML without xml declaration") {
+            constexpr char const *rdfxml_content =
+                "<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\">\n"
+                "  <rdf:Description rdf:about=\"http://example.org/s\">\n"
+                "  </rdf:Description>\n"
+                "</rdf:RDF>";
+            auto guess = guess_format_from_content(rdfxml_content);
+            CHECK(guess.syntax == ParsingFlag::RdfXml);
+            CHECK(guess.is_known());
+        }
+
+        SUBCASE("OWL/XML detection") {
+            constexpr char const *owlxml_content =
+                "<?xml version=\"1.0\"?>\n"
+                "<Ontology xmlns=\"http://www.w3.org/2002/07/owl#\">\n"
+                "  <Declaration><Class IRI=\"#Foo\"/></Declaration>\n"
+                "</Ontology>";
+            auto guess = guess_format_from_content(owlxml_content);
+            CHECK(guess.syntax == ParsingFlag::OwlXml);
+            CHECK(guess.is_known());
+        }
+
+        SUBCASE("JSON-LD detection") {
+            constexpr char const *jsonld_content =
+                "{\n"
+                "  \"@context\": \"http://schema.org/\",\n"
+                "  \"@type\": \"Person\",\n"
+                "  \"name\": \"Jane Doe\"\n"
+                "}";
+            auto guess = guess_format_from_content(jsonld_content);
+            CHECK(guess.syntax == ParsingFlag::JsonLd);
+            CHECK(guess.is_known());
+        }
+
+        SUBCASE("empty content") {
+            CHECK_FALSE(guess_format_from_content("").is_known());
+            CHECK_FALSE(guess_format_from_content("   ").is_known());
+        }
+
+        SUBCASE("BOM handling") {
+            std::string bom_ttl = "\xEF\xBB\xBF@prefix ex: <http://example.org/> .\nex:s ex:p ex:o .\n";
+            auto guess = guess_format_from_content(bom_ttl);
+            CHECK(guess.syntax == ParsingFlag::Turtle);
+        }
+
+        SUBCASE("N-Triples preceded by a comment line") {
+            // The input starts with a '#' comment line; the triple that follows should still yield NTriples.
+            constexpr char const *nt_content =
+                "# This is a comment\n"
+                "<http://example/s> <http://example/p> <http://example/o> .\n";
+            CHECK(guess_format_from_content(nt_content).syntax == ParsingFlag::NTriples);
+        }
+
+        SUBCASE("blank node subject in N-Triples") {
+            constexpr char const *nt_content =
+                "_:b1 <http://example/p> <http://example/o> .\n";
+            auto guess = guess_format_from_content(nt_content);
+            CHECK(guess.syntax == ParsingFlag::NTriples);
+        }
+    }
+
+    TEST_CASE("guess_format combined") {
+        SUBCASE("extension and content agree") {
+            constexpr char const *ttl_content = "@prefix ex: <http://example.org/> .\nex:s ex:p ex:o .\n";
+            auto guess = guess_format("/path/to/file.ttl", ttl_content);
+            CHECK(guess.syntax == ParsingFlag::Turtle);
+            CHECK(guess.confidence == GuessConfidence::Certain);
+        }
+
+        SUBCASE("extension high, content inconclusive") {
+            auto guess = guess_format("/path/to/file.ttl", "");
+            CHECK(guess.syntax == ParsingFlag::Turtle);
+            CHECK(guess.confidence == GuessConfidence::High);
+        }
+
+        SUBCASE("ambiguous extension, content disambiguates to RDF/XML") {
+            constexpr char const *rdfxml_content =
+                "<?xml version=\"1.0\"?>\n"
+                "<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\">\n"
+                "</rdf:RDF>";
+            auto guess = guess_format("/path/to/ontology.owl", rdfxml_content);
+            CHECK(guess.syntax == ParsingFlag::RdfXml);
+            CHECK(guess.confidence == GuessConfidence::High);
+        }
+
+        SUBCASE("ambiguous extension, content disambiguates to OWL/XML") {
+            constexpr char const *owlxml_content =
+                "<?xml version=\"1.0\"?>\n"
+                "<Ontology xmlns=\"http://www.w3.org/2002/07/owl#\">\n"
+                "</Ontology>";
+            auto guess = guess_format("/path/to/ontology.owl", owlxml_content);
+            CHECK(guess.syntax == ParsingFlag::OwlXml);
+        }
+
+        SUBCASE("no extension, content provides guess") {
+            constexpr char const *nt_content =
+                "<http://example/s> <http://example/p> <http://example/o> .\n";
+            auto guess = guess_format("/path/to/data", nt_content);
+            CHECK(guess.syntax == ParsingFlag::NTriples);
+        }
+    }
+
+    TEST_CASE("unsupported format errors") {
+        SUBCASE("OWL/XML throws on IStreamQuadIterator") {
+            // No syntax flag is passed, so construction has to reject the OWL/XML content on its own.
+            constexpr char const *owlxml_content =
+                "<?xml version=\"1.0\"?>\n"
+                "<Ontology xmlns=\"http://www.w3.org/2002/07/owl#\">\n"
+                "  <Declaration><Class IRI=\"#Foo\"/></Declaration>\n"
+                "</Ontology>";
+            std::istringstream iss{owlxml_content};
+            CHECK_THROWS_AS((IStreamQuadIterator{iss}), std::runtime_error);
+        }
+
+        SUBCASE("JSON-LD throws on IStreamQuadIterator") {
+            constexpr char const *jsonld_content =
+                "{\"@context\": \"http://schema.org/\", \"name\": \"Jane\"}";
+            std::istringstream iss{jsonld_content};
+            CHECK_THROWS_AS(IStreamQuadIterator{iss}, std::runtime_error);
+        }
+
+        SUBCASE("explicit OwlXml flag throws") {
+            std::istringstream iss{"whatever"};
+            CHECK_THROWS_AS((IStreamQuadIterator{iss, ParsingFlag::OwlXml}), std::runtime_error);
+        }
+
+        SUBCASE("explicit JsonLd flag throws") {
+            std::istringstream iss{"whatever"};
+            CHECK_THROWS_AS((IStreamQuadIterator{iss, ParsingFlag::JsonLd}), std::runtime_error);
+        }
+    }
+
+    TEST_CASE("Auto mode end-to-end with IStreamQuadIterator") {
+
+        SUBCASE("N-Triples auto-detected") {
+            constexpr char const *nt_content =
+                "<http://example/s> <http://example/p> \"hello\" .\n"
+                "<http://example/s> <http://example/p> <http://example/o> .\n";
+            std::istringstream iss{nt_content};
+            IStreamQuadIterator qit{iss};  // Auto mode (default)
+
+            CHECK(qit.detected_format().syntax == ParsingFlag::NTriples);
+            size_t n = 0;
+            for (; qit != std::default_sentinel; ++qit) {
+                CHECK(qit->has_value());
+                ++n;
+            }
+            CHECK_EQ(n, 2);
+        }
+
+        SUBCASE("Turtle auto-detected via @prefix") {
+            constexpr char const *ttl_content =
+                "@prefix ex: <http://example.org/> .\n"
+                "ex:s ex:p \"test\" .\n";
+            std::istringstream iss{ttl_content};
+            IStreamQuadIterator qit{iss};
+
+            CHECK(qit.detected_format().syntax == ParsingFlag::Turtle);
+            size_t n = 0;
+            for (; qit != std::default_sentinel; ++qit) {
+                CHECK(qit->has_value());
+                ++n;
+            }
+            CHECK_EQ(n, 1);
+        }
+
+        SUBCASE("N-Quads auto-detected") {
+            constexpr char const *nq_content =
+                "<http://example/s> <http://example/p> <http://example/o> <http://example/g> .\n";
+            std::istringstream iss{nq_content};
+            IStreamQuadIterator qit{iss};
+
+            CHECK(qit.detected_format().syntax == ParsingFlag::NQuads);
+            CHECK(qit != std::default_sentinel);
+            CHECK(qit->has_value());
+            CHECK(qit->value().graph() == IRI{"http://example/g"});
+        }
+
+        SUBCASE("RDF/XML auto-detected") {
+            constexpr char const *rdfxml_content =
+                "<?xml version=\"1.0\"?>\n"
+                "<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n"
+                "         xmlns:ex=\"http://example.org/\">\n"
+                "  <rdf:Description rdf:about=\"http://example.org/s\">\n"
+                "    <ex:p>hello</ex:p>\n"
+                "  </rdf:Description>\n"
+                "</rdf:RDF>";
+            std::istringstream iss{rdfxml_content};
+            IStreamQuadIterator qit{iss};
+
+            CHECK(qit.detected_format().syntax == ParsingFlag::RdfXml);
+            CHECK(qit != std::default_sentinel);
+            CHECK(qit->has_value());
+        }
+    }
+
+    TEST_CASE("Explicit flags backward compatibility") {
+
+        SUBCASE("explicit Turtle") {
+            constexpr char const *ttl_content =
+                "@prefix ex: <http://example.org/> .\n"
+                "ex:s ex:p \"test\" .\n";
+            std::istringstream iss{ttl_content};
+            IStreamQuadIterator qit{iss, ParsingFlag::Turtle};
+
+            CHECK(qit.detected_format().syntax == ParsingFlag::Turtle);
+            CHECK(qit != std::default_sentinel);
+            CHECK(qit->has_value());
+        }
+
+        SUBCASE("explicit NTriples") {
+            std::istringstream iss{R"(<http://example/s> <http://example/p> "string" .)"};
+            IStreamQuadIterator qit{iss, ParsingFlag::NTriples};
+
+            CHECK(qit.detected_format().syntax == ParsingFlag::NTriples);
+            CHECK(qit != std::default_sentinel);
+            CHECK(qit->has_value());
+        }
+
+        SUBCASE("explicit NQuads") {
+            std::stringstream str{"<http://example/s> <http://example/p> <http://example/o> <http://example/g> .\n"};
+            IStreamQuadIterator qit{str, ParsingFlag::NQuads};
+
+            CHECK(qit.detected_format().syntax == ParsingFlag::NQuads);
+            CHECK(qit != std::default_sentinel);
+            CHECK(qit->has_value());
+        }
+
+        SUBCASE("explicit TriG") {
+            std::stringstream str{"<http://example/g> {<http://example/s> <http://example/p> <http://example/o> .}"};
+            IStreamQuadIterator qit{str, ParsingFlag::TriG};
+
+            CHECK(qit.detected_format().syntax == ParsingFlag::TriG);
+            CHECK(qit != std::default_sentinel);
+            CHECK(qit->has_value());
+        }
+
+        SUBCASE("explicit RdfXml") {
+            constexpr char const *rdfxml_content =
+                "<?xml version=\"1.0\"?>\n"
+                "<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n"
+                "         xmlns:ex=\"http://example.org/\">\n"
+                "  <rdf:Description rdf:about=\"http://example.org/s\">\n"
+                "    <ex:p>hello</ex:p>\n"
+                "  </rdf:Description>\n"
+                "</rdf:RDF>";
+            std::istringstream iss{rdfxml_content};
+            IStreamQuadIterator qit{iss, ParsingFlag::RdfXml};
+
+            CHECK(qit.detected_format().syntax == ParsingFlag::RdfXml);
+            CHECK(qit != std::default_sentinel);
+            CHECK(qit->has_value());
+        }
+    }
+
+    TEST_CASE("RDFFileParser auto mode") {
+
+        SUBCASE("ttl file auto-detected") {
+            // tests_RDFFileParser_simple.ttl is a pure N-Triples file (no prefixes)
+            // Auto mode should detect it and parse correctly
+            size_t count = 0;
+            for (auto const &v : RDFFileParser{"./tests_RDFFileParser_simple.ttl"}) {
+                if (v.has_value()) {
+                    ++count;
+                } else if (count == 3) {
+                    // expected error on the invalid date
+                    ++count;
+                }
+            }
+            CHECK(count == 4);
+        }
+    }
+
+    TEST_CASE("Auto mode with fopen C-like API") {
+        SUBCASE("N-Triples via fopen") {
+            static constexpr char const *path = "/tmp/rdf4cpp-format-guess-test.nt";
+            {
+                auto *out = fopen(path, "w");
+                REQUIRE(out != nullptr);  // abort here instead of passing a null FILE* to fprintf
+                fprintf(out, "<http://example/s> <http://example/p> \"hello\" .\n");
+                fclose(out);
+            }
+
+            auto *f = fopen(path, "r");
+            REQUIRE(f != nullptr);  // the read callbacks below need a valid stream
+            IStreamQuadIterator qit{f,
+                                     reinterpret_cast<ReadFunc>(fread),
+                                     reinterpret_cast<ErrorFunc>(ferror),
+                                     reinterpret_cast<EOFFunc>(feof)};
+            CHECK(qit.detected_format().syntax == ParsingFlag::NTriples);
+            CHECK(qit != std::default_sentinel);
+            CHECK(qit->has_value());
+            ++qit;
+            CHECK(qit == std::default_sentinel);
+            fclose(f);
+            remove(path);
+        }
+    }
+}
diff --git a/tests/parser/tests_FormatGuess_realworld.cpp b/tests/parser/tests_FormatGuess_realworld.cpp
new file mode 100644
index 000000000..55c5dd34f
--- /dev/null
+++ b/tests/parser/tests_FormatGuess_realworld.cpp
@@ -0,0 +1,322 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+
+#include <doctest/doctest.h>
+
+#include <rdf4cpp.hpp>
+#include <rdf4cpp/parser/FormatGuess.hpp>
+
+#include <curl/curl.h>
+#include <sstream>
+#include <string>
+
+using namespace rdf4cpp;
+using namespace rdf4cpp::parser;
+
+// --- CURL helper (adopted from tests_XMLParser.cpp) ---
+
+static size_t write_callback(void const *contents, size_t size, size_t nmemb, void *userp) {
+    auto const chunk_len = size * nmemb;  // byte count of this chunk (size * nmemb)
+    static_cast<std::string *>(userp)->append(static_cast<char const *>(contents), chunk_len);
+    return chunk_len;  // report the whole chunk as consumed
+}
+static std::string fetch_url(std::string const &url) {  // download url into a string; REQUIRE aborts the test case on failure
+    std::string result;
+    CURL *curl = curl_easy_init();
+    REQUIRE(curl != nullptr);
+    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
+    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &result);  // write_callback appends every chunk to result
+    curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);  // HTTP >= 400 becomes a transfer error instead of an error page parsed as fixture; TLS peer verification stays at libcurl's default (on)
+    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 30L);  // seconds
+    CURLcode res = curl_easy_perform(curl);
+    curl_easy_cleanup(curl);  // release the handle before REQUIRE_EQ can abort the test case
+    REQUIRE_EQ(res, CURLE_OK);
+    return result;
+}
+
+/**
+ * Shared assertion helper. Sniffs the first 4096 bytes of `content`, then combines that sniff with
+ * `filename`; both guesses must be known and equal `expected_syntax`. If `expect_parseable` is set, an
+ * auto-mode IStreamQuadIterator must report the same syntax and yield at least one quad; otherwise its construction must throw std::runtime_error.
+ */
+static void check_detection_and_parse(std::string const &content,
+                                       std::string const &filename,
+                                       ParsingFlag expected_syntax,
+                                       bool expect_parseable) {
+    INFO("file: ", filename);
+
+    auto prefix = std::string_view{content}.substr(0, 4096);
+    auto content_guess = guess_format_from_content(prefix);
+    CHECK_MESSAGE(content_guess.is_known(), "Content sniffing should produce a known format for ", filename);
+    CHECK_MESSAGE(content_guess.syntax == expected_syntax,
+                  "Expected syntax ", static_cast<int>(expected_syntax),
+                  " but got ", static_cast<int>(content_guess.syntax), " for ", filename);
+
+    auto combined = guess_format(filename, prefix);
+    CHECK_MESSAGE(combined.is_known(), "Combined guess should be known for ", filename);
+    CHECK_MESSAGE(combined.syntax == expected_syntax,
+                  "Combined guess expected ", static_cast<int>(expected_syntax),
+                  " but got ", static_cast<int>(combined.syntax), " for ", filename);
+
+    std::istringstream iss{content};
+    if (!expect_parseable) {
+        CHECK_THROWS_AS(IStreamQuadIterator{iss}, std::runtime_error);
+        return;
+    }
+    IStreamQuadIterator qit{iss};
+    CHECK_MESSAGE(qit.detected_format().syntax == expected_syntax,
+                  "Parser detected wrong format for ", filename);
+    size_t quad_count = 0;
+    while (qit != std::default_sentinel) {
+        quad_count += qit->has_value() ? 1 : 0;
+        ++qit;
+    }
+    CHECK_MESSAGE(quad_count > 0, "Expected at least one quad from ", filename);
+}
+
+// ============================================================================
+// Real-world file tests — download and verify detection + parsing
+// ============================================================================
+
+TEST_SUITE("FormatGuess real-world files") {  // every SUBCASE downloads its fixture with fetch_url, so the suite needs network access
+
+    // --- N-Triples (.nt) ---
+
+    TEST_CASE("N-Triples: W3C rdf-n-triples tests") {
+        auto const base = std::string("https://raw.githubusercontent.com/w3c/rdf-tests/main/rdf/rdf11/rdf-n-triples/");
+
+        SUBCASE("literal.nt") {
+            check_detection_and_parse(fetch_url(base + "literal.nt"), "literal.nt", ParsingFlag::NTriples, true);  // args: content, filename for the combined guess, expected syntax, expect_parseable
+        }
+        SUBCASE("literal_all_controls.nt") {
+            check_detection_and_parse(fetch_url(base + "literal_all_controls.nt"), "literal_all_controls.nt", ParsingFlag::NTriples, true);
+        }
+        SUBCASE("langtagged_string.nt") {
+            check_detection_and_parse(fetch_url(base + "langtagged_string.nt"), "langtagged_string.nt", ParsingFlag::NTriples, true);
+        }
+        SUBCASE("comment_following_triple.nt") {
+            check_detection_and_parse(fetch_url(base + "comment_following_triple.nt"), "comment_following_triple.nt", ParsingFlag::NTriples, true);
+        }
+        SUBCASE("literal_true.nt") {
+            check_detection_and_parse(fetch_url(base + "literal_true.nt"), "literal_true.nt", ParsingFlag::NTriples, true);
+        }
+    }
+
+    TEST_CASE("N-Triples: W3C Turtle test outputs (.nt)") {
+        auto const base = std::string("https://raw.githubusercontent.com/w3c/rdf-tests/main/rdf/rdf11/rdf-turtle/");
+
+        SUBCASE("first.nt") {
+            check_detection_and_parse(fetch_url(base + "first.nt"), "first.nt", ParsingFlag::NTriples, true);
+        }
+        SUBCASE("IRIREF_datatype.nt") {
+            check_detection_and_parse(fetch_url(base + "IRIREF_datatype.nt"), "IRIREF_datatype.nt", ParsingFlag::NTriples, true);
+        }
+        SUBCASE("bareword_a_predicate.nt") {
+            check_detection_and_parse(fetch_url(base + "bareword_a_predicate.nt"), "bareword_a_predicate.nt", ParsingFlag::NTriples, true);
+        }
+    }
+
+    TEST_CASE("N-Triples: Serd test outputs") {
+        auto const base = std::string("https://raw.githubusercontent.com/drobilla/serd/main/test/extra/abbreviate/");
+
+        SUBCASE("collapse-predicates.nt") {
+            check_detection_and_parse(fetch_url(base + "collapse-predicates.nt"), "collapse-predicates.nt", ParsingFlag::NTriples, true);
+        }
+        SUBCASE("collapse-subjects.nt") {
+            check_detection_and_parse(fetch_url(base + "collapse-subjects.nt"), "collapse-subjects.nt", ParsingFlag::NTriples, true);
+        }
+    }
+
+    // --- N-Quads (.nq) ---
+
+    TEST_CASE("N-Quads: W3C rdf-trig test outputs (.nq)") {
+        auto const base = std::string("https://raw.githubusercontent.com/w3c/rdf-tests/main/rdf/rdf11/rdf-trig/");
+
+        // Statements carrying a fourth (graph) term must be detected as N-Quads, not N-Triples
+        SUBCASE("alternating_iri_graphs.nq") {
+            check_detection_and_parse(fetch_url(base + "alternating_iri_graphs.nq"), "alternating_iri_graphs.nq", ParsingFlag::NQuads, true);
+        }
+    }
+
+    // --- Turtle (.ttl) ---
+
+    TEST_CASE("Turtle: W3C rdf-turtle tests") {
+        auto const base = std::string("https://raw.githubusercontent.com/w3c/rdf-tests/main/rdf/rdf11/rdf-turtle/");
+
+        SUBCASE("first.ttl") {
+            check_detection_and_parse(fetch_url(base + "first.ttl"), "first.ttl", ParsingFlag::Turtle, true);
+        }
+        SUBCASE("SPARQL_style_prefix.ttl") {
+            check_detection_and_parse(fetch_url(base + "SPARQL_style_prefix.ttl"), "SPARQL_style_prefix.ttl", ParsingFlag::Turtle, true);
+        }
+        SUBCASE("SPARQL_style_base.ttl") {
+            check_detection_and_parse(fetch_url(base + "SPARQL_style_base.ttl"), "SPARQL_style_base.ttl", ParsingFlag::Turtle, true);
+        }
+        SUBCASE("bareword_a_predicate.ttl") {
+            check_detection_and_parse(fetch_url(base + "bareword_a_predicate.ttl"), "bareword_a_predicate.ttl", ParsingFlag::Turtle, true);
+        }
+        SUBCASE("collection_object.ttl") {
+            check_detection_and_parse(fetch_url(base + "collection_object.ttl"), "collection_object.ttl", ParsingFlag::Turtle, true);
+        }
+        SUBCASE("labeled_blank_node_subject.ttl") {
+            // Content is `_:s <p> <o> .`, which is valid N-Triples (a subset of Turtle), so this
+            // case cannot use check_detection_and_parse: content sniffing alone is expected to say
+            // NTriples, and only the combined guess with the .ttl extension yields Turtle.
+            auto content = fetch_url(base + "labeled_blank_node_subject.ttl");
+            auto prefix = std::string_view{content}.substr(0, 4096);
+            auto content_guess = guess_format_from_content(prefix);
+            CHECK(content_guess.syntax == ParsingFlag::NTriples);
+
+            auto combined = guess_format("labeled_blank_node_subject.ttl", prefix);
+            CHECK(combined.syntax == ParsingFlag::Turtle);
+
+            std::istringstream iss{content};
+            IStreamQuadIterator qit{iss};
+            size_t count = 0;
+            for (; qit != std::default_sentinel; ++qit) {
+                if (qit->has_value()) ++count;
+            }
+            CHECK(count > 0);
+        }
+    }
+
+    TEST_CASE("Turtle: Serd project file") {
+        SUBCASE("serd.ttl") {
+            check_detection_and_parse(fetch_url("https://raw.githubusercontent.com/drobilla/serd/main/serd.ttl"), "serd.ttl", ParsingFlag::Turtle, true);
+        }
+    }
+
+    TEST_CASE("Turtle: Serd abbreviation tests") {
+        auto const base = std::string("https://raw.githubusercontent.com/drobilla/serd/main/test/extra/abbreviate/");
+
+        SUBCASE("collapse-predicates.ttl") {
+            check_detection_and_parse(fetch_url(base + "collapse-predicates.ttl"), "collapse-predicates.ttl", ParsingFlag::Turtle, true);
+        }
+        SUBCASE("collapse-subjects.ttl") {
+            check_detection_and_parse(fetch_url(base + "collapse-subjects.ttl"), "collapse-subjects.ttl", ParsingFlag::Turtle, true);
+        }
+    }
+
+    // --- TriG (.trig) ---
+
+    TEST_CASE("TriG: W3C rdf-trig tests") {
+        auto const base = std::string("https://raw.githubusercontent.com/w3c/rdf-tests/main/rdf/rdf11/rdf-trig/");
+
+        SUBCASE("LITERAL1.trig") {
+            check_detection_and_parse(fetch_url(base + "LITERAL1.trig"), "LITERAL1.trig", ParsingFlag::TriG, true);
+        }
+        SUBCASE("trig-kw-graph-01.trig") {
+            check_detection_and_parse(fetch_url(base + "trig-kw-graph-01.trig"), "trig-kw-graph-01.trig", ParsingFlag::TriG, true);
+        }
+        SUBCASE("alternating_iri_graphs.trig") {
+            check_detection_and_parse(fetch_url(base + "alternating_iri_graphs.trig"), "alternating_iri_graphs.trig", ParsingFlag::TriG, true);
+        }
+        SUBCASE("anonymous_blank_node_graph.trig") {
+            check_detection_and_parse(fetch_url(base + "anonymous_blank_node_graph.trig"), "anonymous_blank_node_graph.trig", ParsingFlag::TriG, true);
+        }
+        SUBCASE("labeled_blank_node_graph.trig") {
+            check_detection_and_parse(fetch_url(base + "labeled_blank_node_graph.trig"), "labeled_blank_node_graph.trig", ParsingFlag::TriG, true);
+        }
+    }
+
+    // --- RDF/XML (.rdf) ---
+
+    TEST_CASE("RDF/XML: W3C rdf-xml tests") {
+        auto const base = std::string("https://raw.githubusercontent.com/w3c/rdf-tests/main/rdf/rdf11/rdf-xml/");
+
+        SUBCASE("amp-in-url/test001.rdf") {
+            check_detection_and_parse(fetch_url(base + "amp-in-url/test001.rdf"), "test001.rdf", ParsingFlag::RdfXml, true);
+        }
+        SUBCASE("datatypes/test001.rdf") {
+            check_detection_and_parse(fetch_url(base + "datatypes/test001.rdf"), "datatypes_test001.rdf", ParsingFlag::RdfXml, true);  // NOTE(review): the filename argument is a synthetic label, not the URL path; presumably only its extension matters to guess_format — confirm
+        }
+        SUBCASE("rdf-charmod-literals/test001.rdf") {
+            check_detection_and_parse(fetch_url(base + "rdf-charmod-literals/test001.rdf"), "rdf-charmod-literals_test001.rdf", ParsingFlag::RdfXml, true);
+        }
+        SUBCASE("rdf-charmod-uris/test001.rdf") {
+            check_detection_and_parse(fetch_url(base + "rdf-charmod-uris/test001.rdf"), "rdf-charmod-uris_test001.rdf", ParsingFlag::RdfXml, true);
+        }
+        SUBCASE("rdf-containers-syntax-vs-schema/test001.rdf") {
+            check_detection_and_parse(fetch_url(base + "rdf-containers-syntax-vs-schema/test001.rdf"), "rdf-containers_test001.rdf", ParsingFlag::RdfXml, true);
+        }
+    }
+
+    TEST_CASE("RDF/XML: .owl files with rdf:RDF root (ambiguous extension)") {
+        SUBCASE("pizza.owl — RDF/XML with .owl extension") {
+            auto content = fetch_url("https://raw.githubusercontent.com/owlcs/pizza-ontology/master/pizza.owl");
+            auto prefix = std::string_view{content}.substr(0, 4096);
+            auto guess = guess_format("pizza.owl", prefix);
+            CHECK(guess.syntax == ParsingFlag::RdfXml);
+            CHECK(guess.is_known());
+
+            std::istringstream iss{content};
+            IStreamQuadIterator qit{iss};
+            CHECK(qit.detected_format().syntax == ParsingFlag::RdfXml);
+            size_t count = 0;
+            for (; qit != std::default_sentinel; ++qit) {
+                if (qit->has_value()) ++count;
+            }
+            CHECK(count > 0);
+        }
+    }
+
+    // --- OWL/XML (.owx) — detected but unsupported ---
+
+    TEST_CASE("OWL/XML: horned-owl test files (.owx)") {
+        auto const base = std::string("https://raw.githubusercontent.com/phillord/horned-owl/main/src/ont/owl-xml/");
+
+        SUBCASE("class.owx") {
+            check_detection_and_parse(fetch_url(base + "class.owx"), "class.owx", ParsingFlag::OwlXml, false);  // false: constructing the iterator is expected to throw std::runtime_error
+        }
+        SUBCASE("annotation.owx") {
+            check_detection_and_parse(fetch_url(base + "annotation.owx"), "annotation.owx", ParsingFlag::OwlXml, false);
+        }
+        SUBCASE("and.owx") {
+            check_detection_and_parse(fetch_url(base + "and.owx"), "and.owx", ParsingFlag::OwlXml, false);
+        }
+        SUBCASE("class-assertion.owx") {
+            check_detection_and_parse(fetch_url(base + "class-assertion.owx"), "class-assertion.owx", ParsingFlag::OwlXml, false);
+        }
+        SUBCASE("ontology-annotation.owx") {
+            check_detection_and_parse(fetch_url(base + "ontology-annotation.owx"), "ontology-annotation.owx", ParsingFlag::OwlXml, false);
+        }
+    }
+
+    TEST_CASE("OWL/XML: .owl files with Ontology root (ambiguous extension)") {
+        SUBCASE("Time.owl — OWL/XML with .owl extension") {
+            auto content = fetch_url("https://raw.githubusercontent.com/usnistgov/pdso/master/OWL/Time.owl");
+            auto prefix = std::string_view{content}.substr(0, 4096);
+            auto guess = guess_format("Time.owl", prefix);
+            CHECK(guess.syntax == ParsingFlag::OwlXml);
+
+            std::istringstream iss{content};
+            CHECK_THROWS_AS(IStreamQuadIterator{iss}, std::runtime_error);
+        }
+    }
+
+    // --- JSON-LD (.jsonld) — detected but unsupported ---
+
+    TEST_CASE("JSON-LD: W3C json-ld-api examples") {
+        auto const base = std::string("https://raw.githubusercontent.com/w3c/json-ld-api/main/examples/");
+
+        SUBCASE("Sample-JSON-LD-document.jsonld") {
+            check_detection_and_parse(fetch_url(base + "Sample-JSON-LD-document.jsonld"), "Sample-JSON-LD-document.jsonld", ParsingFlag::JsonLd, false);
+        }
+        SUBCASE("Compacted-sample-document-compacted.jsonld") {
+            check_detection_and_parse(fetch_url(base + "Compacted-sample-document-compacted.jsonld"), "Compacted-sample-document-compacted.jsonld", ParsingFlag::JsonLd, false);
+        }
+        SUBCASE("Expanded-sample-document.jsonld") {
+            check_detection_and_parse(fetch_url(base + "Expanded-sample-document.jsonld"), "Expanded-sample-document.jsonld", ParsingFlag::JsonLd, false);
+        }
+        SUBCASE("JSON-LD-document-in-compact-form.jsonld") {
+            check_detection_and_parse(fetch_url(base + "JSON-LD-document-in-compact-form.jsonld"), "JSON-LD-document-in-compact-form.jsonld", ParsingFlag::JsonLd, false);
+        }
+    }
+
+    TEST_CASE("JSON-LD: json-ld.org context files") {
+        SUBCASE("person.jsonld") {
+            check_detection_and_parse(fetch_url("https://raw.githubusercontent.com/json-ld/json-ld.org/main/contexts/person.jsonld"), "person.jsonld", ParsingFlag::JsonLd, false);
+        }
+    }
+}
diff --git a/tests/parser/tests_IStreamQuadIterator.cpp b/tests/parser/tests_IStreamQuadIterator.cpp
index 8df3754f8..f06b5f152 100644
--- a/tests/parser/tests_IStreamQuadIterator.cpp
+++ b/tests/parser/tests_IStreamQuadIterator.cpp
@@ -328,7 +328,7 @@ TEST_SUITE("IStreamQuadIterator") {
         constexpr char const *triples = "<a> <b> _:bnode .\n";
 
         std::istringstream iss{triples};
-        IStreamQuadIterator qit{iss, ParsingFlag::NoParseBlankNode};
+        IStreamQuadIterator qit{iss, ParsingFlag::NoParseBlankNode | ParsingFlag::Turtle};
 
         CHECK_NE(qit, std::default_sentinel);
         CHECK(!qit->has_value());

From 544c767553a154bfd761dafdd646c344d02cb827 Mon Sep 17 00:00:00 2001
From: bigerl <alexander@bigerl.eu>
Date: Sun, 1 Mar 2026 20:08:14 +0100
Subject: [PATCH 2/7] improved FormatGuess + docs

---
 src/rdf4cpp/parser/FormatGuess.cpp | 273 +++++++++++++++--------------
 src/rdf4cpp/parser/FormatGuess.md  |  80 +++++++++
 2 files changed, 217 insertions(+), 136 deletions(-)
 create mode 100644 src/rdf4cpp/parser/FormatGuess.md

diff --git a/src/rdf4cpp/parser/FormatGuess.cpp b/src/rdf4cpp/parser/FormatGuess.cpp
index c907d69e8..e3422dda4 100644
--- a/src/rdf4cpp/parser/FormatGuess.cpp
+++ b/src/rdf4cpp/parser/FormatGuess.cpp
@@ -1,62 +1,85 @@
 #include "FormatGuess.hpp"
 
 #include <algorithm>
-#include <cctype>
 #include <string>
 
+#include <uni_algo/all.h>
+
 namespace rdf4cpp::parser {
 
     // --- helpers ---
 
-    static std::string to_lower(std::string_view sv) {
-        std::string s{sv};
-        std::ranges::transform(s, s.begin(), [](unsigned char c) {
-            return std::tolower(c);
-        });
-        return s;
+    struct SplitLine {  // result of split_next_line
+        std::string_view line;  // text before the first '\n' (the whole input if there is none)
+        std::string_view rest;  // text after that '\n'; empty if the input had no newline
+    };
+
+    // RDF syntax delimiters are ASCII bytes (0x00-0x7F). In valid UTF-8 every byte of a
+    // multi-byte sequence (lead and continuation alike) is >= 0x80, so an ASCII byte can
+    // never occur inside one. Byte-level scanning for these markers is therefore safe.
+
+    static std::string to_lower(std::string_view const sv) {
+        return una::cases::to_lowercase_utf8(sv);
     }
 
-    static std::string_view skip_whitespace_and_bom(std::string_view sv) {
-        // skip UTF-8 BOM
-        if (sv.size() >= 3 && sv[0] == '\xEF' && sv[1] == '\xBB' && sv[2] == '\xBF') {
-            sv.remove_prefix(3);
+    static bool is_ascii_ws(char const c) noexcept {
+        return std::string_view{" \t\n\r"}.find(c) != std::string_view::npos;  // space, tab, LF, CR only
+    }
+
+    static std::string_view ltrim_ascii_whitespace(std::string_view const sv) noexcept {
+        auto const first_kept = sv.find_first_not_of(" \t\n\r");  // npos if sv is empty or all whitespace
+        return sv.substr(std::min(first_kept, sv.size()));  // clamping npos to size() yields the empty tail of sv
+    }
+
+    static std::string_view trim_ascii_whitespace(std::string_view const sv) noexcept {
+        auto const head_trimmed = ltrim_ascii_whitespace(sv);
+        // head_trimmed is either empty or starts with a non-whitespace byte, so find_last_not_of
+        // returns npos only for the empty view; npos + 1 wraps to 0 and substr(0, 0) stays empty.
+        auto const last_kept = head_trimmed.find_last_not_of(" \t\n\r");
+        return head_trimmed.substr(0, last_kept + 1);
+    }
+
+    /// Split into first line and everything after the newline. If no newline, rest is empty.
+    static SplitLine split_next_line(std::string_view const sv) noexcept {
+        auto const eol = sv.find('\n');
+        if (eol == std::string_view::npos) {
+            return {.line = sv, .rest = {}};
         }
-        // skip leading whitespace
-        while (!sv.empty() && (sv.front() == ' ' || sv.front() == '\t' || sv.front() == '\n' || sv.front() == '\r')) {
-            sv.remove_prefix(1);
+        return {.line = sv.substr(0, eol), .rest = sv.substr(eol + 1)};
+    }
+
+    static std::string_view skip_whitespace_and_bom(std::string_view const sv) {
+        // skip UTF-8 BOM
+        if (sv.starts_with("\xEF\xBB\xBF")) {
+            return ltrim_ascii_whitespace(sv.substr(3));
         }
-        return sv;
+        return ltrim_ascii_whitespace(sv);
     }
 
-    static bool starts_with_icase(std::string_view haystack, std::string_view needle) {
+    static bool starts_with_icase(std::string_view const haystack, std::string_view const needle) {
         if (haystack.size() < needle.size()) {
             return false;
         }
-        for (size_t i = 0; i < needle.size(); ++i) {
-            if (std::tolower(static_cast<unsigned char>(haystack[i])) != std::tolower(static_cast<unsigned char>(needle[i]))) {
-                return false;
-            }
-        }
-        return true;
+        return una::cases::to_lowercase_utf8(haystack.substr(0, needle.size())) == una::cases::to_lowercase_utf8(needle);
     }
 
-    static bool contains(std::string_view haystack, std::string_view needle) {
+    static bool contains(std::string_view const haystack, std::string_view const needle) {
         return haystack.find(needle) != std::string_view::npos;
     }
 
-    static bool contains_icase(std::string_view haystack, std::string_view needle) {
+    static bool contains_icase(std::string_view const haystack, std::string_view const needle) {
         if (needle.size() > haystack.size()) {
             return false;
         }
-        auto lower_hay = to_lower(haystack);
-        auto lower_needle = to_lower(needle);
+        auto const lower_hay = to_lower(haystack);
+        auto const lower_needle = to_lower(needle);
         return lower_hay.find(lower_needle) != std::string::npos;
     }
 
     // --- extension mapping ---
 
-    FormatGuess guess_format_from_extension(std::string_view extension) noexcept {
-        auto ext = to_lower(extension);
+    FormatGuess guess_format_from_extension(std::string_view const extension) noexcept {
+        auto const ext = to_lower(extension);
 
         if (ext == ".ttl" || ext == ".turtle") {
             return {ParsingFlag::Turtle, GuessConfidence::High};
@@ -88,7 +111,7 @@ namespace rdf4cpp::parser {
         return {ParsingFlag::Auto, GuessConfidence::None};
     }
 
-    FormatGuess guess_format_from_path(std::string_view file_path) noexcept {
+    FormatGuess guess_format_from_path(std::string_view const file_path) noexcept {
         // find last path separator
         auto const last_sep = file_path.find_last_of("/\\");
         auto const filename = (last_sep != std::string_view::npos) ? file_path.substr(last_sep + 1) : file_path;
@@ -104,9 +127,8 @@ namespace rdf4cpp::parser {
 
     // --- content sniffing ---
 
-    static bool has_trig_markers(std::string_view content) {
+    static bool has_trig_markers(std::string_view const content) {
         // Look for GRAPH keyword or { } blocks outside of string literals
-        // Simple heuristic: look for GRAPH keyword or standalone { not inside quotes
         if (contains_icase(content, "GRAPH")) {
             return true;
         }
@@ -115,11 +137,13 @@ namespace rdf4cpp::parser {
         // or just { at start of a line (default graph block)
         bool in_string = false;
         char string_delim = 0;
-        for (size_t i = 0; i < content.size(); ++i) {
-            char c = content[i];
+        auto cursor = content;
+        while (!cursor.empty()) {
+            char const c = cursor.front();
+            cursor.remove_prefix(1);
             if (in_string) {
-                if (c == '\\' && i + 1 < content.size()) {
-                    ++i;  // skip escaped char
+                if (c == '\\' && !cursor.empty()) {
+                    cursor.remove_prefix(1);  // skip escaped char
                     continue;
                 }
                 if (c == string_delim) {
@@ -139,15 +163,15 @@ namespace rdf4cpp::parser {
         return false;
     }
 
-    static FormatGuess sniff_xml_content(std::string_view content) {
+    static FormatGuess sniff_xml_content(std::string_view const content) {
         // Check for OWL/XML first — more specific markers.
         // OWL/XML uses <Ontology> root element (not <rdf:RDF>) and may still
         // declare xmlns:rdf as a namespace prefix, so checking OWL/XML before
         // RDF/XML avoids false positives.
-        bool has_ontology_root = contains(content, "<Ontology");
-        bool has_owl_ns = contains(content, "xmlns=\"http://www.w3.org/2002/07/owl#\"");
-        bool has_rdf_root = contains(content, "<rdf:RDF");
-        bool has_rdf_desc = contains(content, "<rdf:Description");
+        bool const has_ontology_root = contains(content, "<Ontology");
+        bool const has_owl_ns = contains(content, "xmlns=\"http://www.w3.org/2002/07/owl#\"");
+        bool const has_rdf_root = contains(content, "<rdf:RDF");
+        bool const has_rdf_desc = contains(content, "<rdf:Description");
 
         if (has_ontology_root && !has_rdf_root) {
             return {ParsingFlag::OwlXml, GuessConfidence::High};
@@ -165,33 +189,24 @@ namespace rdf4cpp::parser {
         return {ParsingFlag::RdfXml, GuessConfidence::Low};
     }
 
-    static FormatGuess sniff_json_content(std::string_view content) {
+    static FormatGuess sniff_json_content(std::string_view const content) {
         if (contains(content, "\"@context\"") || contains(content, "\"@id\"") || contains(content, "\"@graph\"")) {
             return {ParsingFlag::JsonLd, GuessConfidence::High};
         }
         return {ParsingFlag::Auto, GuessConfidence::None};
     }
 
-    static FormatGuess sniff_ntriples_or_nquads(std::string_view content) {
+    static FormatGuess sniff_ntriples_or_nquads(std::string_view const content) {
         // Scan lines looking for N-Triples/N-Quads patterns:
         // Lines of <iri> <iri> <obj> . (3 terms = NT, 4 terms = NQ)
         bool found_4_terms = false;
         bool found_any_triple = false;
 
-        size_t pos = 0;
-        while (pos < content.size()) {
-            // find end of line
-            auto eol = content.find('\n', pos);
-            auto line = content.substr(pos, eol == std::string_view::npos ? std::string_view::npos : eol - pos);
-            pos = (eol == std::string_view::npos) ? content.size() : eol + 1;
-
-            // trim
-            while (!line.empty() && (line.front() == ' ' || line.front() == '\t')) {
-                line.remove_prefix(1);
-            }
-            while (!line.empty() && (line.back() == ' ' || line.back() == '\t' || line.back() == '\r')) {
-                line.remove_suffix(1);
-            }
+        auto remaining = content;
+        while (!remaining.empty()) {
+            auto const [line_raw, rest] = split_next_line(remaining);
+            remaining = rest;
+            auto const line = trim_ascii_whitespace(line_raw);
 
             // skip empty lines and comments
             if (line.empty() || line.front() == '#') {
@@ -202,79 +217,70 @@ namespace rdf4cpp::parser {
             // that appear outside of IRIs and literals correctly.
             int term_count = 0;
             bool found_dot = false;
-            size_t i = 0;
-            while (i < line.size()) {
+            auto cursor = line;
+            while (!cursor.empty()) {
                 // skip whitespace
-                while (i < line.size() && (line[i] == ' ' || line[i] == '\t')) {
-                    ++i;
-                }
-                if (i >= line.size()) {
+                cursor = ltrim_ascii_whitespace(cursor);
+                if (cursor.empty()) {
                     break;
                 }
-                char c = line[i];
+                char const c = cursor.front();
 
                 if (c == '.') {
                     found_dot = true;
-                    ++i;
-                    // skip trailing whitespace and optional comment after .
-                    while (i < line.size() && (line[i] == ' ' || line[i] == '\t')) {
-                        ++i;
-                    }
-                    if (i < line.size() && line[i] == '#') {
-                        // inline comment after dot — valid
-                    }
+                    cursor.remove_prefix(1);
                     break;
                 } else if (c == '#') {
                     // comment at top level (outside IRI/literal) — not valid N-Triples
-                    // unless we already found the dot
                     return {ParsingFlag::Auto, GuessConfidence::None};
                 } else if (c == '<') {
                     // IRI — find closing >
-                    auto close = line.find('>', i);
+                    auto const close = cursor.find('>');
                     if (close == std::string_view::npos) {
                         break;
                     }
-                    i = close + 1;
+                    cursor.remove_prefix(close + 1);
                     ++term_count;
-                } else if (c == '_' && i + 1 < line.size() && line[i + 1] == ':') {
+                } else if (cursor.starts_with("_:")) {
                     // blank node — skip to next whitespace
-                    while (i < line.size() && line[i] != ' ' && line[i] != '\t') {
-                        ++i;
-                    }
+                    auto const ws = std::ranges::find_if(cursor, is_ascii_ws);
+                    cursor.remove_prefix(static_cast<size_t>(ws - cursor.begin()));
                     ++term_count;
                 } else if (c == '"') {
                     // literal — find unescaped closing quote, then skip datatype/lang
-                    ++i;
-                    while (i < line.size()) {
-                        if (line[i] == '\\') {
-                            i += 2;
+                    cursor.remove_prefix(1);
+                    while (!cursor.empty()) {
+                        if (cursor.front() == '\\') {
+                            cursor.remove_prefix(std::min<size_t>(2, cursor.size()));
                             continue;
                         }
-                        if (line[i] == '"') {
+                        if (cursor.front() == '"') {
                             break;
                         }
-                        ++i;
+                        cursor.remove_prefix(1);
                     }
-                    if (i < line.size()) {
-                        ++i;  // skip closing quote
+                    if (!cursor.empty()) {
+                        cursor.remove_prefix(1);  // skip closing quote
                     }
                     // skip ^^<datatype> or @lang (which may contain # inside <...>)
-                    if (i + 1 < line.size() && line[i] == '^' && line[i + 1] == '^') {
-                        i += 2;
-                        if (i < line.size() && line[i] == '<') {
-                            auto close = line.find('>', i);
+                    if (cursor.starts_with("^^")) {
+                        cursor.remove_prefix(2);
+                        if (!cursor.empty() && cursor.front() == '<') {
+                            auto const close = cursor.find('>');
                             if (close != std::string_view::npos) {
-                                i = close + 1;
+                                cursor.remove_prefix(close + 1);
                             }
                         } else {
-                            while (i < line.size() && line[i] != ' ' && line[i] != '\t' && line[i] != '.') {
-                                ++i;
-                            }
-                        }
-                    } else if (i < line.size() && line[i] == '@') {
-                        while (i < line.size() && line[i] != ' ' && line[i] != '\t' && line[i] != '.') {
-                            ++i;
+                            auto const end = std::ranges::find_if(cursor, [](char ch) {
+                                return is_ascii_ws(ch) || ch == '.';
+                            });
+                            cursor.remove_prefix(static_cast<size_t>(end - cursor.begin()));
                         }
+                    } else if (!cursor.empty() && cursor.front() == '@') {
+                        auto const end = std::ranges::find_if(cursor, [](char ch) {
+                            return is_ascii_ws(ch) || ch == '.';
+                        });
+                        cursor.remove_prefix(static_cast<size_t>(end - cursor.begin()));
                     }
                     ++term_count;
                 } else {
@@ -306,30 +312,27 @@ namespace rdf4cpp::parser {
         return {ParsingFlag::NTriples, GuessConfidence::Medium};
     }
 
-    static std::string_view skip_comments(std::string_view sv) {
+    static std::string_view skip_comments(std::string_view const sv) {
         // skip lines starting with # (comments in N-Triples/Turtle/TriG)
-        while (!sv.empty() && sv.front() == '#') {
-            auto eol = sv.find('\n');
+        auto cursor = sv;
+        while (!cursor.empty() && cursor.front() == '#') {
+            auto const eol = cursor.find('\n');
             if (eol == std::string_view::npos) {
                 return {};
             }
-            sv.remove_prefix(eol + 1);
-            // skip whitespace after comment line
-            while (!sv.empty() && (sv.front() == ' ' || sv.front() == '\t' || sv.front() == '\n' || sv.front() == '\r')) {
-                sv.remove_prefix(1);
-            }
+            cursor = ltrim_ascii_whitespace(cursor.substr(eol + 1));
         }
-        return sv;
+        return cursor;
     }
 
-    FormatGuess guess_format_from_content(std::string_view prefix) noexcept {
-        auto full_content = skip_whitespace_and_bom(prefix);
+    FormatGuess guess_format_from_content(std::string_view const prefix) noexcept {
+        auto const full_content = skip_whitespace_and_bom(prefix);
         if (full_content.empty()) {
             return {ParsingFlag::Auto, GuessConfidence::None};
         }
 
         // Skip leading comment lines for the first-byte checks
-        auto content = skip_comments(full_content);
+        auto const content = skip_comments(full_content);
         if (content.empty()) {
             return {ParsingFlag::Auto, GuessConfidence::None};
         }
@@ -344,13 +347,7 @@ namespace rdf4cpp::parser {
         // JSON-based formats — but `{` can also be a TriG default graph block,
         // and `[` can be a TriG blank node graph name or a Turtle blank node property list.
         if (content.front() == '[') {
-            auto after_bracket = content.substr(1);
-            while (!after_bracket.empty()
-                   && (after_bracket.front() == ' ' || after_bracket.front() == '\t' || after_bracket.front() == '\n'
-                       || after_bracket.front() == '\r'))
-            {
-                after_bracket.remove_prefix(1);
-            }
+            auto const after_bracket = ltrim_ascii_whitespace(content.substr(1));
             // JSON arrays start with `[` followed by `{`, `"`, `[`, number, true, false, null
             // Turtle/TriG blank nodes: `[]` or `[ predicate object ]`
             if (!after_bracket.empty() && (after_bracket.front() == ']' || after_bracket.front() == '<' || after_bracket.front() == '_')) {
@@ -363,13 +360,7 @@ namespace rdf4cpp::parser {
             return sniff_json_content(content);
         }
         if (content.front() == '{') {
-            auto after_brace = content.substr(1);
-            while (!after_brace.empty()
-                   && (after_brace.front() == ' ' || after_brace.front() == '\t' || after_brace.front() == '\n'
-                       || after_brace.front() == '\r'))
-            {
-                after_brace.remove_prefix(1);
-            }
+            auto const after_brace = ltrim_ascii_whitespace(content.substr(1));
             if (after_brace.empty() || after_brace.front() == '"') {
                 return sniff_json_content(content);
             }
@@ -394,8 +385,8 @@ namespace rdf4cpp::parser {
         }
 
         // Phase 2: try N-Triples / N-Quads line-based detection
-        if (content.front() == '<' || (content.front() == '_' && content.size() > 1 && content[1] == ':')) {
-            auto result = sniff_ntriples_or_nquads(full_content);
+        if (content.front() == '<' || content.starts_with("_:")) {
+            auto const result = sniff_ntriples_or_nquads(full_content);
             if (result.is_known()) {
                 return result;
             }
@@ -406,50 +397,60 @@ namespace rdf4cpp::parser {
         // Phase 3: check for Turtle/TriG syntax markers in content that didn't
         // match any earlier patterns (e.g. Turtle without @prefix directives)
         {
+            static constexpr std::string_view turtle_markers = ";,()[]{}";
             bool has_turtle_marker = false;
             bool in_iri = false;
             bool in_string = false;
             char string_delim = 0;
+            char prev_char = 0;
 
-            for (size_t i = 0; i < content.size(); ++i) {
-                char c = content[i];
+            auto cursor = content;
+            while (!cursor.empty()) {
+                char const c = cursor.front();
+                cursor.remove_prefix(1);
                 if (in_string) {
-                    if (c == '\\' && i + 1 < content.size()) {
-                        ++i;
+                    if (c == '\\' && !cursor.empty()) {
+                        prev_char = cursor.front();
+                        cursor.remove_prefix(1);
                         continue;
                     }
                     if (c == string_delim) {
                         in_string = false;
                     }
+                    prev_char = c;
                     continue;
                 }
                 if (in_iri) {
                     if (c == '>') {
                         in_iri = false;
                     }
+                    prev_char = c;
                     continue;
                 }
                 if (c == '<') {
                     in_iri = true;
+                    prev_char = c;
                     continue;
                 }
                 if (c == '"' || c == '\'') {
                     in_string = true;
                     string_delim = c;
+                    prev_char = c;
                     continue;
                 }
                 // Turtle/TriG syntax markers not valid in N-Triples
-                if (c == ';' || c == ',' || c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}') {
+                if (turtle_markers.find(c) != std::string_view::npos) {
                     has_turtle_marker = true;
                     break;
                 }
                 // bare `a` surrounded by whitespace = Turtle shorthand for rdf:type
-                if (c == 'a' && i > 0 && (content[i - 1] == ' ' || content[i - 1] == '\t') && i + 1 < content.size()
-                    && (content[i + 1] == ' ' || content[i + 1] == '\t'))
+                if (c == 'a' && (prev_char == ' ' || prev_char == '\t') && !cursor.empty()
+                    && (cursor.front() == ' ' || cursor.front() == '\t'))
                 {
                     has_turtle_marker = true;
                     break;
                 }
+                prev_char = c;
             }
 
             if (has_turtle_marker) {
@@ -463,9 +464,9 @@ namespace rdf4cpp::parser {
         return {ParsingFlag::Auto, GuessConfidence::None};
     }
 
-    FormatGuess guess_format(std::string_view file_path, std::string_view prefix) noexcept {
-        auto ext_guess = guess_format_from_path(file_path);
-        auto content_guess = guess_format_from_content(prefix);
+    FormatGuess guess_format(std::string_view const file_path, std::string_view const prefix) noexcept {
+        auto const ext_guess = guess_format_from_path(file_path);
+        auto const content_guess = guess_format_from_content(prefix);
 
         // If extension gives a strong match and no content sniffing needed
         if (ext_guess.confidence == GuessConfidence::High) {
diff --git a/src/rdf4cpp/parser/FormatGuess.md b/src/rdf4cpp/parser/FormatGuess.md
new file mode 100644
index 000000000..9477eddba
--- /dev/null
+++ b/src/rdf4cpp/parser/FormatGuess.md
@@ -0,0 +1,80 @@
+# FormatGuess — RDF Serialization Format Detection
+
+## Purpose
+
+`FormatGuess` provides automatic detection of RDF serialization formats from
+file extensions and/or a content prefix (the first few hundred to few thousand
+bytes). It returns a `FormatGuess` consisting of a `ParsingFlag` (the detected
+syntax) and a `GuessConfidence` level.
+
+## Detection Strategy
+
+Three entry points, in order of specificity:
+
+| Function                        | Input          | Returned confidence levels                                                                                                                                         |
+|---------------------------------|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `guess_format_from_extension()` | file extension | High (`.ttl`, `.nt`, …), Low (`.owl`, `.xml`), or None                                                                                                             |
+| `guess_format_from_content()`   | byte prefix    | High (XML root, `@prefix`, JSON-LD keywords), Medium (`PREFIX`/`BASE`, N-Triples/N-Quads grammar, TriG markers), Low (Turtle syntax markers, generic XML), or None |
+| `guess_format()`                | path + prefix  | Certain (extension + content agree), otherwise delegates to the above                                                                                              |
+
+`guess_format()` combines extension and content results: when both agree the
+confidence is boosted to **Certain**.
+
+## Content Sniffing Phases
+
+`guess_format_from_content()` inspects the prefix in three ordered phases.
+Processing stops at the first match.
+
+### Phase 1 — Deterministic Checks
+
+Fast tests on the first non-whitespace, non-comment bytes:
+
+* `<?xml`, `<rdf:RDF`, `<rdf:` → XML sub-classifier (`sniff_xml_content`)
+* `[` / `{` → disambiguate JSON-LD vs Turtle/TriG based on what follows
+* `@prefix` / `@base` → Turtle (or TriG if GRAPH / `{` found elsewhere)
+* `PREFIX` / `BASE` (case-insensitive) → Turtle/TriG
+
+### Phase 2 — N-Triples / N-Quads Grammar
+
+If the prefix starts with `<` or `_:`, attempt a strict line-by-line parse
+(`sniff_ntriples_or_nquads`). Each line must contain exactly 3 or 4 terms
+followed by a dot. If any line fails, fall through to Phase 3.
+
+* 3-term lines only → N-Triples (Medium)
+* Any 4-term line → N-Quads (Medium)
+
+### Phase 3 — Turtle / TriG Markers
+
+Scan for syntax characters that are valid in Turtle/TriG but not in
+N-Triples: `;` `,` `(` `)` `[` `]` `{` `}` and the bare keyword `a`
+(rdf:type shorthand). Strings and IRIs are skipped to avoid false matches.
+
+If markers are found, check for TriG-specific patterns (GRAPH keyword or
+`{` outside strings) and return Turtle or TriG at Low confidence.
+
+## Encoding Assumptions
+
+All RDF syntax delimiters scanned by this module are ASCII bytes (0x00–0x7F).
+In valid UTF-8, lead bytes (0xC2–0xF4) and continuation bytes (0x80–0xBF) are
+all non-ASCII, so byte-level scanning for these markers is safe without full
+UTF-8 decoding. Case-insensitive comparisons (e.g. for the GRAPH keyword and
+file extensions) use `una::cases::to_lowercase_utf8()` for correctness.
+
+## Confidence Levels
+
+| Level   | Meaning                                                   |
+|---------|-----------------------------------------------------------|
+| None    | No guess could be made                                    |
+| Low     | Weak heuristic (ambiguous extension, syntax markers only) |
+| Medium  | Good signal from content sniffing                         |
+| High    | Unambiguous extension or strong content match             |
+| Certain | Extension and content agree                               |
+
+## Format Precedence
+
+When extension and content disagree:
+
+* **High-confidence extension** wins over content.
+* **Low-confidence extension** (`.owl`, `.xml`) defers to content if content
+  produces a known result.
+* **No extension** relies entirely on content.

From bd6e5a50da02a30f8cc10fa2909c97eaa2215528 Mon Sep 17 00:00:00 2001
From: bigerl <alexander@bigerl.eu>
Date: Sun, 1 Mar 2026 20:16:13 +0100
Subject: [PATCH 3/7] fix docs

---
 src/rdf4cpp/parser/FormatGuess.hpp |  1 +
 src/rdf4cpp/parser/FormatGuess.md  | 20 +++++++++++---------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/rdf4cpp/parser/FormatGuess.hpp b/src/rdf4cpp/parser/FormatGuess.hpp
index 268734e7d..e56b18480 100644
--- a/src/rdf4cpp/parser/FormatGuess.hpp
+++ b/src/rdf4cpp/parser/FormatGuess.hpp
@@ -50,6 +50,7 @@ namespace rdf4cpp::parser {
     /**
      * Guess the RDF serialization format by inspecting a prefix of the file content.
      * At least 512 bytes recommended, 4096 bytes ideal.
+     * Strips a leading UTF-8 BOM and skips whitespace and #-comment lines before sniffing.
      */
     [[nodiscard]] FormatGuess guess_format_from_content(std::string_view prefix) noexcept;
 
diff --git a/src/rdf4cpp/parser/FormatGuess.md b/src/rdf4cpp/parser/FormatGuess.md
index 9477eddba..4a34e723c 100644
--- a/src/rdf4cpp/parser/FormatGuess.md
+++ b/src/rdf4cpp/parser/FormatGuess.md
@@ -11,19 +11,20 @@ syntax) and a `GuessConfidence` level.
 
 Three entry points, in order of specificity:
 
-| Function                        | Input          | Returned confidence levels                                                                                                                                         |
-|---------------------------------|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `guess_format_from_extension()` | file extension | High (`.ttl`, `.nt`, …), Low (`.owl`, `.xml`), or None                                                                                                             |
-| `guess_format_from_content()`   | byte prefix    | High (XML root, `@prefix`, JSON-LD keywords), Medium (`PREFIX`/`BASE`, N-Triples/N-Quads grammar, TriG markers), Low (Turtle syntax markers, generic XML), or None |
-| `guess_format()`                | path + prefix  | Certain (extension + content agree), otherwise delegates to the above                                                                                              |
+| Function                        | Input          | Returned confidence levels                                                                                                                                              |
+|---------------------------------|----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `guess_format_from_extension()` | file extension | High (`.ttl`, `.nt`, …), Low (`.owl`, `.xml`), or None                                                                                                                  |
+| `guess_format_from_content()`   | byte prefix    | High (XML root, `@prefix`, JSON-LD keywords), Medium (`PREFIX`/`BASE`, N-Triples/N-Quads grammar, TriG markers), Low (Turtle/TriG syntax markers, generic XML), or None |
+| `guess_format()`                | path + prefix  | Certain (extension + content agree), otherwise delegates to the above                                                                                                   |
 
 `guess_format()` combines extension and content results: when both agree the
 confidence is boosted to **Certain**.
 
 ## Content Sniffing Phases
 
-`guess_format_from_content()` inspects the prefix in three ordered phases.
-Processing stops at the first match.
+`guess_format_from_content()` first strips a UTF-8 BOM (if present) and
+leading whitespace, then skips leading `#`-comment lines before inspecting
+the prefix in three ordered phases. Processing stops at the first match.
 
 ### Phase 1 — Deterministic Checks
 
@@ -49,8 +50,9 @@ Scan for syntax characters that are valid in Turtle/TriG but not in
 N-Triples: `;` `,` `(` `)` `[` `]` `{` `}` and the bare keyword `a`
 (rdf:type shorthand). Strings and IRIs are skipped to avoid false matches.
 
-If markers are found, check for TriG-specific patterns (GRAPH keyword or
-`{` outside strings) and return Turtle or TriG at Low confidence.
+If markers are found, check for TriG-specific patterns (case-insensitive
+GRAPH keyword or `{` outside strings) and return Turtle or TriG at Low
+confidence.
 
 ## Encoding Assumptions
 

From 745515349de8bff232fde8c50bcae54b2ad957c5 Mon Sep 17 00:00:00 2001
From: bigerl <alexander@bigerl.eu>
Date: Sun, 1 Mar 2026 20:31:54 +0100
Subject: [PATCH 4/7] refactoring

---
 src/rdf4cpp/parser/FormatGuess.cpp | 221 ++++++++++++++---------------
 src/rdf4cpp/parser/FormatGuess.hpp |   2 +-
 tests/bench_SerDe.cpp              |   2 +-
 3 files changed, 111 insertions(+), 114 deletions(-)

diff --git a/src/rdf4cpp/parser/FormatGuess.cpp b/src/rdf4cpp/parser/FormatGuess.cpp
index e3422dda4..7261297fb 100644
--- a/src/rdf4cpp/parser/FormatGuess.cpp
+++ b/src/rdf4cpp/parser/FormatGuess.cpp
@@ -18,10 +18,6 @@ namespace rdf4cpp::parser {
     // UTF-8 continuation bytes.  Byte-level scanning for these markers is safe
     // in valid UTF-8.
 
-    static std::string to_lower(std::string_view const sv) {
-        return una::cases::to_lowercase_utf8(sv);
-    }
-
     static bool is_ascii_ws(char const c) noexcept {
         return c == ' ' || c == '\t' || c == '\n' || c == '\r';
     }
@@ -48,7 +44,7 @@ namespace rdf4cpp::parser {
         return {.line = sv.substr(0, eol), .rest = sv.substr(eol + 1)};
     }
 
-    static std::string_view skip_whitespace_and_bom(std::string_view const sv) {
+    static std::string_view skip_whitespace_and_bom(std::string_view const sv) noexcept {
         // skip UTF-8 BOM
         if (sv.starts_with("\xEF\xBB\xBF")) {
             return ltrim_ascii_whitespace(sv.substr(3));
@@ -68,18 +64,13 @@ namespace rdf4cpp::parser {
     }
 
     static bool contains_icase(std::string_view const haystack, std::string_view const needle) {
-        if (needle.size() > haystack.size()) {
-            return false;
-        }
-        auto const lower_hay = to_lower(haystack);
-        auto const lower_needle = to_lower(needle);
-        return lower_hay.find(lower_needle) != std::string::npos;
+        return static_cast<bool>(una::caseless::find_utf8(haystack, needle));
     }
 
     // --- extension mapping ---
 
     FormatGuess guess_format_from_extension(std::string_view const extension) noexcept {
-        auto const ext = to_lower(extension);
+        auto const ext = una::cases::to_lowercase_utf8(extension);
 
         if (ext == ".ttl" || ext == ".turtle") {
             return {ParsingFlag::Turtle, GuessConfidence::High};
@@ -116,7 +107,7 @@ namespace rdf4cpp::parser {
         auto const last_sep = file_path.find_last_of("/\\");
         auto const filename = (last_sep != std::string_view::npos) ? file_path.substr(last_sep + 1) : file_path;
 
-        // find last dot in filename
+        // find the last dot in the filename
         auto const dot_pos = filename.rfind('.');
         if (dot_pos == std::string_view::npos) {
             return {ParsingFlag::Auto, GuessConfidence::None};
@@ -133,8 +124,8 @@ namespace rdf4cpp::parser {
             return true;
         }
 
-        // Look for pattern like IRI/prefixed-name followed by {
-        // or just { at start of a line (default graph block)
+        // Look for a pattern like IRI/prefixed-name followed by {
+        // or just { at the start of a line (default graph block)
         bool in_string = false;
         char string_delim = 0;
         auto cursor = content;
@@ -214,7 +205,7 @@ namespace rdf4cpp::parser {
             }
 
             // Count terms by walking through the line, handling # comments
-            // that appear outside of IRIs and literals correctly.
+            // that appear outside IRIs and literals correctly.
             int term_count = 0;
             bool found_dot = false;
             auto cursor = line;
@@ -224,14 +215,13 @@ namespace rdf4cpp::parser {
                 if (cursor.empty()) {
                     break;
                 }
-                char const c = cursor.front();
 
-                if (c == '.') {
+                if (char const c = cursor.front(); c == '.') {
                     found_dot = true;
                     cursor.remove_prefix(1);
                     break;
                 } else if (c == '#') {
-                    // comment at top level (outside IRI/literal) — not valid N-Triples
+                    // comment at the top level (outside IRI/literal) — not valid N-Triples
                     return {ParsingFlag::Auto, GuessConfidence::None};
                 } else if (c == '<') {
                     // IRI — find closing >
@@ -242,7 +232,7 @@ namespace rdf4cpp::parser {
                     cursor.remove_prefix(close + 1);
                     ++term_count;
                 } else if (cursor.starts_with("_:")) {
-                    // blank node — skip to next whitespace
+                    // blank node — skip to the next whitespace
                     auto const ws = std::ranges::find_if(cursor, is_ascii_ws);
                     cursor.remove_prefix(static_cast<size_t>(ws - cursor.begin()));
                     ++term_count;
@@ -266,18 +256,17 @@ namespace rdf4cpp::parser {
                     if (cursor.starts_with("^^")) {
                         cursor.remove_prefix(2);
                         if (!cursor.empty() && cursor.front() == '<') {
-                            auto const close = cursor.find('>');
-                            if (close != std::string_view::npos) {
+                            if (auto const close = cursor.find('>'); close != std::string_view::npos) {
                                 cursor.remove_prefix(close + 1);
                             }
                         } else {
-                            auto const end = std::ranges::find_if(cursor, [](char ch) {
+                            auto const end = std::ranges::find_if(cursor, [](char const ch) noexcept {
                                 return is_ascii_ws(ch) || ch == '.';
                             });
                             cursor.remove_prefix(static_cast<size_t>(end - cursor.begin()));
                         }
                     } else if (!cursor.empty() && cursor.front() == '@') {
-                        auto const end = std::ranges::find_if(cursor, [](char ch) {
+                        auto const end = std::ranges::find_if(cursor, [](char const ch) noexcept {
                             return is_ascii_ws(ch) || ch == '.';
                         });
                         cursor.remove_prefix(static_cast<size_t>(end - cursor.begin()));
@@ -325,6 +314,95 @@ namespace rdf4cpp::parser {
         return cursor;
     }
 
+    static FormatGuess sniff_bracket_content(std::string_view const content, std::string_view const full_content) { // input led by `[`: Turtle/TriG blank node vs. JSON
+        auto const after_bracket = ltrim_ascii_whitespace(content.substr(1)); // drop the first byte of content, then any ASCII whitespace
+        // JSON arrays start with `[` followed by `{`, `"`, `[`, a number, true, false or null.
+        // Turtle/TriG blank nodes look like `[]` or `[ predicate object ]`; only `]`, `<` and `_` are accepted as the next byte here.
+        if (!after_bracket.empty() && (after_bracket.front() == ']' || after_bracket.front() == '<' || after_bracket.front() == '_')) {
+            // Likely a Turtle/TriG blank node; the TriG check runs on full_content, not on content.
+            if (has_trig_markers(full_content)) {
+                return {ParsingFlag::TriG, GuessConfidence::Medium};
+            }
+            return {ParsingFlag::Turtle, GuessConfidence::Low};
+        }
+        return sniff_json_content(content); // everything else (including empty remainder) goes to the JSON sniffer
+    }
+
+    static FormatGuess sniff_brace_content(std::string_view const content) { // input led by `{`: JSON vs. TriG graph block
+        auto const after_brace = ltrim_ascii_whitespace(content.substr(1)); // drop the first byte of content, then any ASCII whitespace
+        if (after_brace.empty() || after_brace.front() == '"') { // nothing after the brace, or a `"` next: let the JSON sniffer decide
+            return sniff_json_content(content);
+        }
+        // Likely TriG — `{` followed by non-JSON content
+        return {ParsingFlag::TriG, GuessConfidence::Medium};
+    }
+
+    static FormatGuess sniff_turtle_or_trig_markers(std::string_view const content, std::string_view const full_content) { // Phase 3: scan content for Turtle/TriG-only syntax
+        static constexpr std::string_view turtle_markers = ";,()[]{}"; // punctuation that counts as a Turtle/TriG marker
+        bool has_turtle_marker = false; // set once a marker or a bare `a` is seen
+        bool in_iri = false;            // true between `<` and the next `>`
+        bool in_string = false;         // true between an opening quote and its matching delimiter
+        char string_delim = 0;          // the quote character (`"` or `'`) that opened the current string
+        char prev_char = 0;             // last byte consumed; consulted by the bare-`a` check below
+
+        auto cursor = content;
+        while (!cursor.empty()) {
+            char const c = cursor.front();
+            cursor.remove_prefix(1); // from here on, cursor.front() is the byte AFTER c
+            if (in_string) {
+                if (c == '\\' && !cursor.empty()) { // backslash escape: consume the next byte too so an escaped quote cannot close the string
+                    prev_char = cursor.front();
+                    cursor.remove_prefix(1);
+                    continue;
+                }
+                if (c == string_delim) {
+                    in_string = false;
+                }
+                prev_char = c;
+                continue;
+            }
+            if (in_iri) { // bytes inside an IRI are ignored until the closing `>`
+                if (c == '>') {
+                    in_iri = false;
+                }
+                prev_char = c;
+                continue;
+            }
+            if (c == '<') {
+                in_iri = true;
+                prev_char = c;
+                continue;
+            }
+            if (c == '"' || c == '\'') {
+                in_string = true;
+                string_delim = c;
+                prev_char = c;
+                continue;
+            }
+            // Turtle/TriG syntax markers not valid in N-Triples. NOTE(review): `#` comments are not skipped by this loop, so markers inside them count as well.
+            if (turtle_markers.find(c) != std::string_view::npos) {
+                has_turtle_marker = true;
+                break;
+            }
+            // bare `a` surrounded by whitespace = Turtle shorthand for rdf:type; only space and tab count as whitespace on either side (not newlines)
+            if (c == 'a' && (prev_char == ' ' || prev_char == '\t') && !cursor.empty() && (cursor.front() == ' ' || cursor.front() == '\t'))
+            {
+                has_turtle_marker = true;
+                break;
+            }
+            prev_char = c;
+        }
+
+        if (has_turtle_marker) { // both outcomes are reported at Low confidence; TriG is decided on full_content
+            if (has_trig_markers(full_content)) {
+                return {ParsingFlag::TriG, GuessConfidence::Low};
+            }
+            return {ParsingFlag::Turtle, GuessConfidence::Low};
+        }
+
+        return {ParsingFlag::Auto, GuessConfidence::None}; // no marker found: no guess
+    }
+
     FormatGuess guess_format_from_content(std::string_view const prefix) noexcept {
         auto const full_content = skip_whitespace_and_bom(prefix);
         if (full_content.empty()) {
@@ -347,25 +425,10 @@ namespace rdf4cpp::parser {
         // JSON-based formats — but `{` can also be a TriG default graph block,
         // and `[` can be a TriG blank node graph name or a Turtle blank node property list.
         if (content.front() == '[') {
-            auto const after_bracket = ltrim_ascii_whitespace(content.substr(1));
-            // JSON arrays start with `[` followed by `{`, `"`, `[`, number, true, false, null
-            // Turtle/TriG blank nodes: `[]` or `[ predicate object ]`
-            if (!after_bracket.empty() && (after_bracket.front() == ']' || after_bracket.front() == '<' || after_bracket.front() == '_')) {
-                // Likely Turtle/TriG blank node
-                if (has_trig_markers(full_content)) {
-                    return {ParsingFlag::TriG, GuessConfidence::Medium};
-                }
-                return {ParsingFlag::Turtle, GuessConfidence::Low};
-            }
-            return sniff_json_content(content);
+            return sniff_bracket_content(content, full_content);
         }
         if (content.front() == '{') {
-            auto const after_brace = ltrim_ascii_whitespace(content.substr(1));
-            if (after_brace.empty() || after_brace.front() == '"') {
-                return sniff_json_content(content);
-            }
-            // Likely TriG — `{` followed by non-JSON content
-            return {ParsingFlag::TriG, GuessConfidence::Medium};
+            return sniff_brace_content(content);
         }
 
         // Turtle directives (case-sensitive @prefix/@base)
@@ -386,8 +449,7 @@ namespace rdf4cpp::parser {
 
         // Phase 2: try N-Triples / N-Quads line-based detection
         if (content.front() == '<' || content.starts_with("_:")) {
-            auto const result = sniff_ntriples_or_nquads(full_content);
-            if (result.is_known()) {
+            if (auto const result = sniff_ntriples_or_nquads(full_content); result.is_known()) {
                 return result;
             }
             // If N-Triples detection failed, the content starts with `<` or `_:`
@@ -395,73 +457,8 @@ namespace rdf4cpp::parser {
         }
 
         // Phase 3: check for Turtle/TriG syntax markers in content that didn't
-        // match any earlier patterns (e.g. Turtle without @prefix directives)
-        {
-            static constexpr std::string_view turtle_markers = ";,()[]{}";
-            bool has_turtle_marker = false;
-            bool in_iri = false;
-            bool in_string = false;
-            char string_delim = 0;
-            char prev_char = 0;
-
-            auto cursor = content;
-            while (!cursor.empty()) {
-                char const c = cursor.front();
-                cursor.remove_prefix(1);
-                if (in_string) {
-                    if (c == '\\' && !cursor.empty()) {
-                        prev_char = cursor.front();
-                        cursor.remove_prefix(1);
-                        continue;
-                    }
-                    if (c == string_delim) {
-                        in_string = false;
-                    }
-                    prev_char = c;
-                    continue;
-                }
-                if (in_iri) {
-                    if (c == '>') {
-                        in_iri = false;
-                    }
-                    prev_char = c;
-                    continue;
-                }
-                if (c == '<') {
-                    in_iri = true;
-                    prev_char = c;
-                    continue;
-                }
-                if (c == '"' || c == '\'') {
-                    in_string = true;
-                    string_delim = c;
-                    prev_char = c;
-                    continue;
-                }
-                // Turtle/TriG syntax markers not valid in N-Triples
-                if (turtle_markers.find(c) != std::string_view::npos) {
-                    has_turtle_marker = true;
-                    break;
-                }
-                // bare `a` surrounded by whitespace = Turtle shorthand for rdf:type
-                if (c == 'a' && (prev_char == ' ' || prev_char == '\t') && !cursor.empty()
-                    && (cursor.front() == ' ' || cursor.front() == '\t'))
-                {
-                    has_turtle_marker = true;
-                    break;
-                }
-                prev_char = c;
-            }
-
-            if (has_turtle_marker) {
-                if (has_trig_markers(full_content)) {
-                    return {ParsingFlag::TriG, GuessConfidence::Low};
-                }
-                return {ParsingFlag::Turtle, GuessConfidence::Low};
-            }
-        }
-
-        return {ParsingFlag::Auto, GuessConfidence::None};
+        // match any earlier patterns (e.g., Turtle without @prefix directives)
+        return sniff_turtle_or_trig_markers(content, full_content);
     }
 
     FormatGuess guess_format(std::string_view const file_path, std::string_view const prefix) noexcept {
@@ -474,11 +471,11 @@ namespace rdf4cpp::parser {
             if (content_guess.is_known() && content_guess.syntax == ext_guess.syntax) {
                 return {ext_guess.syntax, GuessConfidence::Certain};
             }
-            // Extension is high confidence — trust it even if content is ambiguous
+            // Extension is high confidence — trust it even if the content is ambiguous
             return ext_guess;
         }
 
-        // Low confidence extension (e.g. .owl, .xml) — need content disambiguation
+        // Low confidence extension (e.g., .owl, .xml) — need content disambiguation
         if (ext_guess.confidence == GuessConfidence::Low) {
             if (content_guess.is_known()) {
                 // Content overrides ambiguous extension
diff --git a/src/rdf4cpp/parser/FormatGuess.hpp b/src/rdf4cpp/parser/FormatGuess.hpp
index e56b18480..8a17223d4 100644
--- a/src/rdf4cpp/parser/FormatGuess.hpp
+++ b/src/rdf4cpp/parser/FormatGuess.hpp
@@ -13,7 +13,7 @@ namespace rdf4cpp::parser {
      */
     enum struct GuessConfidence : uint8_t {
         None = 0,  ///< no guess could be made
-        Low,       ///< weak heuristic match (e.g. ambiguous extension like .owl)
+        Low,       ///< weak heuristic match (e.g., ambiguous extension like .owl)
         Medium,    ///< content sniffing with good signal
         High,      ///< file extension match or strong content match
         Certain,   ///< unambiguous (extension + content agree)
diff --git a/tests/bench_SerDe.cpp b/tests/bench_SerDe.cpp
index a221b5145..e620a059a 100644
--- a/tests/bench_SerDe.cpp
+++ b/tests/bench_SerDe.cpp
@@ -6,7 +6,7 @@
 #include <rdf4cpp/storage/reference_node_storage/UnsyncReferenceNodeStorage.hpp>
 
 void download_swdf(std::filesystem::path const &base) {
-    auto curl_cmd = std::format("wget -P '{}' https://hobbitdata.informatik.uni-leipzig.de/ISWC2020_Tentris/swdf.zip", base.c_str());
+    auto curl_cmd = std::format("wget -P '{}' https://files.dice-research.org/datasets/ISWC2020_Tentris/swdf.zip", base.c_str());
     std::system(curl_cmd.c_str());
 
     auto const swdf_path = base / "swdf.zip";

From 9cc710d8de7a2db045f524ce27bc3a82a1441cfe Mon Sep 17 00:00:00 2001
From: bigerl <alexander@bigerl.eu>
Date: Sun, 1 Mar 2026 21:07:49 +0100
Subject: [PATCH 5/7] use filesystem

---
 src/rdf4cpp/parser/FormatGuess.cpp | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/src/rdf4cpp/parser/FormatGuess.cpp b/src/rdf4cpp/parser/FormatGuess.cpp
index 7261297fb..955275bd4 100644
--- a/src/rdf4cpp/parser/FormatGuess.cpp
+++ b/src/rdf4cpp/parser/FormatGuess.cpp
@@ -1,6 +1,7 @@
 #include "FormatGuess.hpp"
 
 #include <algorithm>
+#include <filesystem>
 #include <string>
 
 #include <uni_algo/all.h>
@@ -103,17 +104,11 @@ namespace rdf4cpp::parser {
     }
 
     FormatGuess guess_format_from_path(std::string_view const file_path) noexcept {
-        // find last path separator
-        auto const last_sep = file_path.find_last_of("/\\");
-        auto const filename = (last_sep != std::string_view::npos) ? file_path.substr(last_sep + 1) : file_path;
-
-        // find the last dot in the filename
-        auto const dot_pos = filename.rfind('.');
-        if (dot_pos == std::string_view::npos) {
+        auto const ext = std::filesystem::path{file_path}.extension().string();  // extension including its leading dot, empty if the filename has none
+        if (ext.empty()) {  // nothing to go on: report "no guess"
             return {ParsingFlag::Auto, GuessConfidence::None};
         }
-
-        return guess_format_from_extension(filename.substr(dot_pos));
+        return guess_format_from_extension(ext);  // NOTE(review): path/string construction above may allocate and throw inside this noexcept function — confirm acceptable
     }
 
     // --- content sniffing ---

From b802941ad79ae046b0c48065505030555b538a73 Mon Sep 17 00:00:00 2001
From: bigerl <alexander@bigerl.eu>
Date: Mon, 2 Mar 2026 08:59:31 +0100
Subject: [PATCH 6/7] add missing file

---
 .../rdf4cpp/parser/PrefixBufferedReader.hpp   | 79 +++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 private/rdf4cpp/parser/PrefixBufferedReader.hpp

diff --git a/private/rdf4cpp/parser/PrefixBufferedReader.hpp b/private/rdf4cpp/parser/PrefixBufferedReader.hpp
new file mode 100644
index 000000000..c4f10aa75
--- /dev/null
+++ b/private/rdf4cpp/parser/PrefixBufferedReader.hpp
@@ -0,0 +1,79 @@
+#ifndef RDF4CPP_PARSER_PRIVATE_PREFIXBUFFEREDREADER_HPP
+#define RDF4CPP_PARSER_PRIVATE_PREFIXBUFFEREDREADER_HPP
+
+#include <algorithm>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include <rdf4cpp/parser/IStreamQuadIterator.hpp>
+
+namespace rdf4cpp::parser {
+
+    /**
+     * Wraps a C-like IO stream (void*, ReadFunc, ErrorFunc, EOFFunc) and
+     * serves a buffered prefix first, then delegates to the underlying stream.
+     *
+     * This is used when we peek at the start of a stream for content sniffing
+     * but need to replay those bytes for the actual parser.
+     */
+    struct PrefixBufferedReader {
+        void *underlying_stream;     ///< opaque handle handed to the underlying callbacks
+        ReadFunc underlying_read;    ///< read callback of the wrapped stream
+        ErrorFunc underlying_error;  ///< error callback of the wrapped stream
+        EOFFunc underlying_eof;      ///< eof callback of the wrapped stream
+        std::vector<char> prefix_buf;  ///< bytes replayed before the underlying stream is read
+        size_t prefix_offset = 0;      ///< number of bytes of prefix_buf already served
+
+        PrefixBufferedReader(void *stream, ReadFunc read, ErrorFunc error, EOFFunc eof, std::vector<char> prefix)
+            : underlying_stream{stream},
+              underlying_read{read},
+              underlying_error{error},
+              underlying_eof{eof},
+              prefix_buf{std::move(prefix)} {
+        }
+
+        /// fread-like read: fills buffer from the prefix, then the underlying stream; returns complete elements read.
+        static size_t read_func(void *buffer, size_t elem_size, size_t count, void *voided_self) noexcept {
+            auto *self = static_cast<PrefixBufferedReader *>(voided_self);
+            auto *buf = static_cast<char *>(buffer);
+            size_t total_bytes = elem_size * count;
+            size_t bytes_read = 0;
+
+            // serve from the prefix buffer first
+            size_t const prefix_remaining = self->prefix_buf.size() - self->prefix_offset;
+            if (prefix_remaining > 0) {
+                size_t const from_prefix = std::min(total_bytes, prefix_remaining);
+                std::memcpy(buf, self->prefix_buf.data() + self->prefix_offset, from_prefix);
+                self->prefix_offset += from_prefix;
+                bytes_read += from_prefix;
+                total_bytes -= from_prefix;
+                buf += from_prefix;
+            }
+            // delegate remaining to the underlying stream; elements of size 1, so its result counts bytes
+            if (total_bytes > 0) {
+                bytes_read += self->underlying_read(buf, 1, total_bytes, self->underlying_stream);
+            }
+            // report complete elements rather than bytes (same value when elem_size == 1); avoid dividing by zero
+            return elem_size == 0 ? 0 : bytes_read / elem_size;
+        }
+
+        /// Forwards to the error callback of the underlying stream.
+        static int error_func(void *voided_self) noexcept {
+            auto const *self = static_cast<PrefixBufferedReader *>(voided_self);
+            return self->underlying_error(self->underlying_stream);
+        }
+
+        /// Returns 0 while buffered prefix bytes remain; afterwards defers to the underlying eof callback.
+        static int eof_func(void *voided_self) noexcept {
+            auto const *self = static_cast<PrefixBufferedReader *>(voided_self);
+            if (self->prefix_offset < self->prefix_buf.size()) {
+                return 0;
+            }
+            return self->underlying_eof(self->underlying_stream);
+        }
+    };
+
+}  // namespace rdf4cpp::parser
+
+#endif  // RDF4CPP_PARSER_PRIVATE_PREFIXBUFFEREDREADER_HPP

From b6c3435b4578757bfa9f2d0740a08637c6a68501 Mon Sep 17 00:00:00 2001
From: bigerl <alexander@bigerl.eu>
Date: Mon, 2 Mar 2026 10:27:14 +0100
Subject: [PATCH 7/7] fix leak

---
 private/rdf4cpp/parser/XMLParser.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/private/rdf4cpp/parser/XMLParser.hpp b/private/rdf4cpp/parser/XMLParser.hpp
index bf1e38d48..93af65643 100644
--- a/private/rdf4cpp/parser/XMLParser.hpp
+++ b/private/rdf4cpp/parser/XMLParser.hpp
@@ -32,6 +32,10 @@ namespace rdf4cpp::parser {
         // see https://github.com/NVIDIA/stdexec/issues/1143
         struct XmlParserCtxtDtorLambda {
             void operator()(xmlParserCtxt *c) const {
+                if (c != nullptr && c->myDoc != nullptr) {  // a document may still be attached to the context
+                    xmlFreeDoc(c->myDoc);  // free it explicitly; per libxml2 docs xmlFreeParserCtxt leaves myDoc alone
+                    c->myDoc = nullptr;  // clear the pointer so the context no longer refers to the freed document
+                }
                 xmlFreeParserCtxt(c);
             }
         };