Skip to content

Commit 8e9b1ae

Browse files
committed
more graceful degradation of invalid unicode
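If invalid UTF-8 is encountered while decoding, log a Warning and return the standard Unicode replacement character U+FFFD (0xFFFD) instead of raising an Error. When incrementing or decrementing over an invalid sequence, make a best effort to do what the caller asked by stepping a single byte (staying within the range bounds) so that iteration still makes progress. This might produce one or more � characters, but that's better than an Error and a hard crash. Also move the character constants (COMMENT_CHAR, EOLN, CARRIAGE_RETURN) from parselo.h to pstypes.h, and add codepoint equivalents in unicode.h.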
1 parent e71e949 commit 8e9b1ae

4 files changed

Lines changed: 31 additions & 11 deletions

File tree

code/globalincs/pstypes.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@
4747
#define DIR_SEPARATOR_STR "/"
4848
#endif
4949

50+
constexpr char COMMENT_CHAR = static_cast<char>(';');
51+
constexpr char EOLN = static_cast<char>(0x0a);
52+
constexpr char CARRIAGE_RETURN = static_cast<char>(0x0d);
53+
5054
#ifndef NDEBUG
5155
constexpr bool FSO_DEBUG = true;
5256
#else

code/parse/parselo.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,6 @@ extern int fred_parse_flag;
3434
extern int Token_found_flag;
3535

3636

37-
#define COMMENT_CHAR (char)';'
38-
#define EOLN (char)0x0a
39-
#define CARRIAGE_RETURN (char)0x0d
40-
4137
enum class LineEndingType { UNKNOWN, CR, CRLF, LF };
4238

4339
#define F_NAME 1

code/utils/unicode.cpp

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,11 @@ text_iterator& unicode::text_iterator::operator++() {
2525
// Increment by UTF-8 encoded codepoints
2626
utf8::next(current_byte, range_end_byte);
2727
} catch(const std::exception& e) {
28-
Error(LOCATION, "Exception while incrementing UTF-8 sequence near '%.16s': %s", current_byte, e.what());
29-
return *this;
28+
Warning(LOCATION, "Exception while incrementing UTF-8 sequence near '%.16s': %s", current_byte, e.what());
29+
// Increment by byte, so we still make progress
30+
if (current_byte < range_end_byte) {
31+
++current_byte;
32+
}
3033
}
3134
} else {
3235
// Increment by byte
@@ -41,11 +44,14 @@ text_iterator& text_iterator::operator--() {
4144
// Decrement by UTF-8 encoded codepoints
4245
utf8::prior(current_byte, range_start_byte);
4346
} catch(const std::exception& e) {
44-
Error(LOCATION, "Exception while decrementing text iterator near '%.16s': %s", current_byte, e.what());
45-
return *this;
47+
Warning(LOCATION, "Exception while decrementing text iterator near '%.16s': %s", current_byte, e.what());
48+
// Decrement by byte, so we still make progress
49+
if (current_byte > range_start_byte) {
50+
--current_byte;
51+
}
4652
}
4753
} else {
48-
// Increment by byte
54+
// Decrement by byte
4955
--current_byte;
5056
}
5157

@@ -66,8 +72,8 @@ text_iterator::value_type text_iterator::operator*() const {
6672
try {
6773
return utf8::peek_next(current_byte, range_end_byte);
6874
} catch(const std::exception& e) {
69-
Error(LOCATION, "Exception while decoding UTF-8 sequence near '%.16s': %s", current_byte, e.what());
70-
return 0;
75+
Warning(LOCATION, "Exception while decoding UTF-8 sequence near '%.16s': %s", current_byte, e.what());
76+
return replacement_char;
7177
}
7278
} else {
7379
// Use the unsigned byte value here to avoid integer overflows

code/utils/unicode.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,20 @@ namespace unicode {
3333
*/
3434
typedef char32_t codepoint_t;
3535

36+
/**
37+
* @brief An invalid and ignorable character, equivalent to -1
38+
*/
39+
constexpr codepoint_t invalid_char = static_cast<codepoint_t>(-1);
40+
41+
/**
42+
* @brief Substitute for malformed UTF-8 so that a bad decode can degrade gracefully
43+
*/
44+
constexpr codepoint_t replacement_char = 0xFFFD;
45+
46+
constexpr codepoint_t comment_char = static_cast<codepoint_t>(COMMENT_CHAR);
47+
constexpr codepoint_t eoln = static_cast<codepoint_t>(EOLN);
48+
constexpr codepoint_t carriage_return = static_cast<codepoint_t>(CARRIAGE_RETURN);
49+
3650
class text_iterator {
3751
const char* current_byte = nullptr;
3852
const char* range_end_byte = nullptr;

0 commit comments

Comments
 (0)