diff --git a/include/fkYAML/detail/output/serializer.hpp b/include/fkYAML/detail/output/serializer.hpp index 82fce1f6..257252db 100644 --- a/include/fkYAML/detail/output/serializer.hpp +++ b/include/fkYAML/detail/output/serializer.hpp @@ -262,10 +262,10 @@ class basic_serializer { str += m_tmp_str_buff; break; case node_type::STRING: { - bool is_escaped = false; - auto str_val = get_string_node_value(node, is_escaped); + bool needs_quotes = false; + auto str_val = get_string_node_value(node, needs_quotes); - if (is_escaped) { + if (needs_quotes) { // There's no other token type with escapes than strings. // Also, escapes must be in double-quoted strings. str += '\"'; @@ -366,15 +366,82 @@ class basic_serializer { return false; } + /// @brief Scan a string for YAML indicator characters that require quoting. + /// @param[in] str The already escaped string value to be inspected. + /// @return true if the string contains characters that could be interpreted + /// by a YAML parser as structural tokens (e.g., alias, anchor, tag, + /// comment, or flow indicators) or space at the beginning, and therefore + /// must be surrounded with quotes to preserve its semantic meaning; + /// false otherwise. + bool scan_yaml_tokens(const std::string& str) const { + if (str.empty()) { + // Empty plain scalar is technically valid, and escaper already + // decided we don't need quotes for whitespace/escapes. + return false; + } + + // The string contains spaces at the beginning or end of the stirng. " data " + if (str.front() == ' ') + { + return true; + } + if (str.back() == ' ') + { + return true; + } + + for (char c : str) { + switch (c) { + // Definitely dangerous as plain: will be lexed as alias/anchor/tag/comment + case '*': // alias: *foo + case '&': // anchor: &foo + case '!': // tag + case '#': // comment start + return true; + + // Structural indicators; legal in plain scalars in some positions, + // but conservative choice is to quote whenever present. + case '[': + case ']': + case '{': + case '}': + case ',': + case '|': + case '>': + case '@': + case '`': + case '%': + return true; + + // “Flow” indicators at beginning of scalar or before space are tricky. + // Being conservative, just quote if they appear anywhere. + case ':': + case '?': + case '-': + return true; + + default: + break; + } + } + + // No suspicious YAML indicator characters found, can stay plain. + return false; + } + /// @brief Get a string value from the given node and, if necessary, escape its contents. /// @param[in] node The target string YAML node. - /// @param[out] is_escaped Whether the contents of an output string has been escaped. + /// @param[out] need_quotes Whether the contents of an output string needs to be quoted. /// @return The (escaped) string node value. - typename BasicNodeType::string_type get_string_node_value(const BasicNodeType& node, bool& is_escaped) { + typename BasicNodeType::string_type get_string_node_value(const BasicNodeType& node, bool& need_quotes) const { FK_YAML_ASSERT(node.is_string()); const auto& s = node.as_str(); - return yaml_escaper::escape(s.c_str(), s.c_str() + s.size(), is_escaped); + auto result = yaml_escaper::escape(s.c_str(), s.c_str() + s.size(), need_quotes); + if (!need_quotes) { + need_quotes = scan_yaml_tokens(result); + } + return result; } // LCOV_EXCL_LINE private: diff --git a/single_include/fkYAML/node.hpp b/single_include/fkYAML/node.hpp index 05e79572..fbca663d 100644 --- a/single_include/fkYAML/node.hpp +++ b/single_include/fkYAML/node.hpp @@ -11000,10 +11000,10 @@ class basic_serializer { str += m_tmp_str_buff; break; case node_type::STRING: { - bool is_escaped = false; - auto str_val = get_string_node_value(node, is_escaped); + bool needs_quotes = false; + auto str_val = get_string_node_value(node, needs_quotes); - if (is_escaped) { + if (needs_quotes) { // There's no other token type with escapes than strings. // Also, escapes must be in double-quoted strings. str += '\"'; @@ -11104,15 +11104,82 @@ class basic_serializer { return false; } + /// @brief Scan a string for YAML indicator characters that require quoting. + /// @param[in] str The already escaped string value to be inspected. + /// @return true if the string contains characters that could be interpreted + /// by a YAML parser as structural tokens (e.g., alias, anchor, tag, + /// comment, or flow indicators) or space at the beginning, and therefore + /// must be surrounded with quotes to preserve its semantic meaning; + /// false otherwise. + bool scan_yaml_tokens(const std::string& str) const { + if (str.empty()) { + // Empty plain scalar is technically valid, and escaper already + // decided we don't need quotes for whitespace/escapes. + return false; + } + + // The string contains spaces at the beginning or end of the stirng. " data " + if (str.front() == ' ') + { + return true; + } + if (str.back() == ' ') + { + return true; + } + + for (char c : str) { + switch (c) { + // Definitely dangerous as plain: will be lexed as alias/anchor/tag/comment + case '*': // alias: *foo + case '&': // anchor: &foo + case '!': // tag + case '#': // comment start + return true; + + // Structural indicators; legal in plain scalars in some positions, + // but conservative choice is to quote whenever present. + case '[': + case ']': + case '{': + case '}': + case ',': + case '|': + case '>': + case '@': + case '`': + case '%': + return true; + + // “Flow” indicators at beginning of scalar or before space are tricky. + // Being conservative, just quote if they appear anywhere. + case ':': + case '?': + case '-': + return true; + + default: + break; + } + } + + // No suspicious YAML indicator characters found, can stay plain. + return false; + } + /// @brief Get a string value from the given node and, if necessary, escape its contents. /// @param[in] node The target string YAML node. - /// @param[out] is_escaped Whether the contents of an output string has been escaped. + /// @param[out] need_quotes Whether the contents of an output string needs to be quoted. /// @return The (escaped) string node value. - typename BasicNodeType::string_type get_string_node_value(const BasicNodeType& node, bool& is_escaped) { + typename BasicNodeType::string_type get_string_node_value(const BasicNodeType& node, bool& need_quotes) const { FK_YAML_ASSERT(node.is_string()); const auto& s = node.as_str(); - return yaml_escaper::escape(s.c_str(), s.c_str() + s.size(), is_escaped); + auto result = yaml_escaper::escape(s.c_str(), s.c_str() + s.size(), need_quotes); + if (!need_quotes) { + need_quotes = scan_yaml_tokens(result); + } + return result; } // LCOV_EXCL_LINE private: diff --git a/tests/unit_test/CMakeLists.txt b/tests/unit_test/CMakeLists.txt index bca79e0e..6b1b0c5b 100644 --- a/tests/unit_test/CMakeLists.txt +++ b/tests/unit_test/CMakeLists.txt @@ -244,6 +244,7 @@ add_executable( test_utf_encodings.cpp test_yaml_escaper_class.cpp test_yaml_version_type.cpp + test_string_serialize_deserialize.cpp main.cpp ) diff --git a/tests/unit_test/test_string_serialize_deserialize.cpp b/tests/unit_test/test_string_serialize_deserialize.cpp new file mode 100644 index 00000000..accb179e --- /dev/null +++ b/tests/unit_test/test_string_serialize_deserialize.cpp @@ -0,0 +1,55 @@ +// String serialize/deserialize round-trip tests + +#include + +#include + +TEST_CASE("String_Serialize_Deserialize_Roundtrip") { + fkyaml::node source = fkyaml::node::mapping(); + source["alias"] = "*anchor"; + source["anchor"] = "&anchor"; + source["tag"] = "!tag"; + source["comment"] = "#not-a-comment"; + source["brackets"] = "[1,2]"; + source["braces"] = "{a: 1}"; + source["comma"] = "a,b"; + source["pipe"] = "a|b"; + source["gt"] = "a>b"; + source["at"] = "user@host"; + source["backtick"] = "`code`"; + source["percent"] = "100%"; + source["colon"] = "key:value"; + source["question"] = "?maybe"; + source["dash"] = "-value"; + source["begin_space"] = " value"; + source["end_space"] = "value "; + source["space"] = " value "; + + // Values that should *not* trigger scane_yaml_tokens + source["plain"] = "just_text"; + + std::stringstream buff; + buff << source; + + fkyaml::node copy; + REQUIRE_NOTHROW(copy = fkyaml::node::deserialize(buff)); + REQUIRE(source["alias"] == copy["alias"]); + REQUIRE(source["anchor"] == copy["anchor"]); + REQUIRE(source["tag"] == copy["tag"]); + REQUIRE(source["comment"] == copy["comment"]); + REQUIRE(source["brackets"] == copy["brackets"]); + REQUIRE(source["braces"] == copy["braces"]); + REQUIRE(source["comma"] == copy["comma"]); + REQUIRE(source["pipe"] == copy["pipe"]); + REQUIRE(source["gt"] == copy["gt"]); + REQUIRE(source["at"] == copy["at"]); + REQUIRE(source["backtick"] == copy["backtick"]); + REQUIRE(source["percent"] == copy["percent"]); + REQUIRE(source["colon"] == copy["colon"]); + REQUIRE(source["question"] == copy["question"]); + REQUIRE(source["dash"] == copy["dash"]); + REQUIRE(source["plain"] == copy["plain"]); + REQUIRE(source["begin_space"] == copy["begin_space"]); + REQUIRE(source["end_space"] == copy["end_space"]); + REQUIRE(source["space"] == copy["space"]); +}