diff --git a/code/globalincs/pstypes.h b/code/globalincs/pstypes.h
index 55d94de70b6..c8c504ccc53 100644
--- a/code/globalincs/pstypes.h
+++ b/code/globalincs/pstypes.h
@@ -47,6 +47,10 @@
 #define DIR_SEPARATOR_STR "/"
 #endif
 
+constexpr char COMMENT_CHAR = static_cast<char>(';');
+constexpr char EOLN = static_cast<char>(0x0a);
+constexpr char CARRIAGE_RETURN = static_cast<char>(0x0d);
+
 #ifndef NDEBUG
 constexpr bool FSO_DEBUG = true;
 #else
diff --git a/code/parse/parselo.h b/code/parse/parselo.h
index 8640962302d..b501ff4e2cd 100644
--- a/code/parse/parselo.h
+++ b/code/parse/parselo.h
@@ -34,10 +34,6 @@
 extern int fred_parse_flag;
 extern int Token_found_flag;
 
-#define COMMENT_CHAR (char)';'
-#define EOLN (char)0x0a
-#define CARRIAGE_RETURN (char)0x0d
-
 enum class LineEndingType { UNKNOWN, CR, CRLF, LF };
 
 #define F_NAME 1
diff --git a/code/utils/unicode.cpp b/code/utils/unicode.cpp
index 1229e63cb1a..6158160bd65 100644
--- a/code/utils/unicode.cpp
+++ b/code/utils/unicode.cpp
@@ -19,14 +19,20 @@ text_iterator::text_iterator(const char* in_current_byte, const char* in_range_s
 #endif
 	}
 }
+
+constexpr auto warning_text = "Exception while %s near '%.16s': %s\n\nThis is most likely caused by text created in another encoding, such as Windows-1252, that cannot be interpreted as UTF-8.";
+
 text_iterator& unicode::text_iterator::operator++() {
 	if (Unicode_text_mode) {
 		try {
 			// Increment by UTF-8 encoded codepoints
 			utf8::next(current_byte, range_end_byte);
 		} catch(const std::exception& e) {
-			Error(LOCATION, "Exception while incrementing UTF-8 sequence near '%.16s': %s", current_byte, e.what());
-			return *this;
+			Warning(LOCATION, warning_text, "incrementing text iterator", current_byte, e.what());
+			// Increment by byte, so we still make progress
+			if (current_byte < range_end_byte) {
+				++current_byte;
+			}
 		}
 	} else {
 		// Increment by byte
@@ -41,11 +47,14 @@ text_iterator& text_iterator::operator--() {
 			// Decrement by UTF-8 encoded codepoints
 			utf8::prior(current_byte, range_start_byte);
 		} catch(const std::exception& e) {
-			Error(LOCATION, "Exception while decrementing text iterator near '%.16s': %s", current_byte, e.what());
-			return *this;
+			Warning(LOCATION, warning_text, "decrementing text iterator", current_byte, e.what());
+			// Decrement by byte, so we still make progress
+			if (current_byte > range_start_byte) {
+				--current_byte;
+			}
 		}
 	} else {
-		// Increment by byte
+		// Decrement by byte
 		--current_byte;
 	}
 
@@ -66,8 +75,8 @@ text_iterator::value_type text_iterator::operator*() const {
 		try {
 			return utf8::peek_next(current_byte, range_end_byte);
 		} catch(const std::exception& e) {
-			Error(LOCATION, "Exception while decoding UTF-8 sequence near '%.16s': %s", current_byte, e.what());
-			return 0;
+			Warning(LOCATION, warning_text, "decoding UTF-8 sequence", current_byte, e.what());
+			return replacement_char;
 		}
 	} else {
 		// Use the unsigned byte value here to avoid integer overflows
diff --git a/code/utils/unicode.h b/code/utils/unicode.h
index 7bb0f85c1be..96e9f177201 100644
--- a/code/utils/unicode.h
+++ b/code/utils/unicode.h
@@ -33,6 +33,20 @@ namespace unicode {
  */
 typedef char32_t codepoint_t;
 
+/**
+ * @brief An invalid and ignorable character, equivalent to -1
+ */
+constexpr codepoint_t invalid_char = static_cast<codepoint_t>(-1);
+
+/**
+ * @brief Substitute for malformed UTF-8 so that a bad decode can degrade gracefully
+ */
+constexpr codepoint_t replacement_char = 0xFFFD;
+
+constexpr codepoint_t comment_char = static_cast<codepoint_t>(COMMENT_CHAR);
+constexpr codepoint_t eoln = static_cast<codepoint_t>(EOLN);
+constexpr codepoint_t carriage_return = static_cast<codepoint_t>(CARRIAGE_RETURN);
+
 class text_iterator {
 	const char* current_byte = nullptr;
 	const char* range_end_byte = nullptr;