Skip to content

Commit d15cadc

Browse files
authored
Merge pull request #7537 from Goober5000/fix/defensive_unicode
more graceful degradation of invalid unicode
2 parents 21b8035 + 43b4724 commit d15cadc

4 files changed

Lines changed: 34 additions & 11 deletions

File tree

code/globalincs/pstypes.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@
4747
#define DIR_SEPARATOR_STR "/"
4848
#endif
4949

50+
// Semicolon character constant (value 0x3b).
constexpr char COMMENT_CHAR = ';';
51+
// Line feed byte (0x0a).
constexpr char EOLN = '\x0a';
52+
// Carriage return byte (0x0d).
constexpr char CARRIAGE_RETURN = '\x0d';
53+
5054
#ifndef NDEBUG
5155
constexpr bool FSO_DEBUG = true;
5256
#else

code/parse/parselo.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,6 @@ extern int fred_parse_flag;
3434
extern int Token_found_flag;
3535

3636

37-
#define COMMENT_CHAR (char)';'
38-
#define EOLN (char)0x0a
39-
#define CARRIAGE_RETURN (char)0x0d
40-
4137
// Kinds of line ending: a lone CR, a CR followed by LF, a lone LF, or UNKNOWN.
enum class LineEndingType {
	UNKNOWN,
	CR,
	CRLF,
	LF
};
4238

4339
#define F_NAME 1

code/utils/unicode.cpp

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,20 @@ text_iterator::text_iterator(const char* in_current_byte, const char* in_range_s
1919
#endif
2020
}
2121
}
22+
23+
// printf-style template shared by the text_iterator warnings in this file.
// Arguments, in order: a description of the operation that failed, a pointer to
// the text at the failure position (at most 16 bytes of it are printed), and the
// message of the caught exception.
constexpr const char* warning_text =
	"Exception while %s near '%.16s': %s\n\n"
	"This is most likely caused by text created in another encoding, "
	"such as Windows-1252, that cannot be interpreted as UTF-8.";
24+
2225
text_iterator& unicode::text_iterator::operator++() {
2326
if (Unicode_text_mode) {
2427
try {
2528
// Increment by UTF-8 encoded codepoints
2629
utf8::next(current_byte, range_end_byte);
2730
} catch(const std::exception& e) {
28-
Error(LOCATION, "Exception while incrementing UTF-8 sequence near '%.16s': %s", current_byte, e.what());
29-
return *this;
31+
Warning(LOCATION, warning_text, "incrementing text iterator", current_byte, e.what());
32+
// Increment by byte, so we still make progress
33+
if (current_byte < range_end_byte) {
34+
++current_byte;
35+
}
3036
}
3137
} else {
3238
// Increment by byte
@@ -41,11 +47,14 @@ text_iterator& text_iterator::operator--() {
4147
// Decrement by UTF-8 encoded codepoints
4248
utf8::prior(current_byte, range_start_byte);
4349
} catch(const std::exception& e) {
44-
Error(LOCATION, "Exception while decrementing text iterator near '%.16s': %s", current_byte, e.what());
45-
return *this;
50+
Warning(LOCATION, warning_text, "decrementing text iterator", current_byte, e.what());
51+
// Decrement by byte, so we still make progress
52+
if (current_byte > range_start_byte) {
53+
--current_byte;
54+
}
4655
}
4756
} else {
48-
// Increment by byte
57+
// Decrement by byte
4958
--current_byte;
5059
}
5160

@@ -66,8 +75,8 @@ text_iterator::value_type text_iterator::operator*() const {
6675
try {
6776
return utf8::peek_next(current_byte, range_end_byte);
6877
} catch(const std::exception& e) {
69-
Error(LOCATION, "Exception while decoding UTF-8 sequence near '%.16s': %s", current_byte, e.what());
70-
return 0;
78+
Warning(LOCATION, warning_text, "decoding UTF-8 sequence", current_byte, e.what());
79+
return replacement_char;
7180
}
7281
} else {
7382
// Use the unsigned byte value here to avoid integer overflows

code/utils/unicode.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,20 @@ namespace unicode {
3333
*/
3434
using codepoint_t = char32_t;
3535

36+
/**
37+
* @brief An invalid and ignorable character, equivalent to -1
38+
*/
39+
constexpr auto invalid_char = static_cast<codepoint_t>(-1); // -1 converted to the unsigned codepoint type: every bit set
40+
41+
/**
42+
* @brief Substitute for malformed UTF-8 so that a bad decode can degrade gracefully
43+
*/
44+
constexpr codepoint_t replacement_char = U'\uFFFD'; // code point U+FFFD
45+
46+
constexpr codepoint_t comment_char = static_cast<codepoint_t>(COMMENT_CHAR);
47+
constexpr codepoint_t eoln = static_cast<codepoint_t>(EOLN);
48+
constexpr codepoint_t carriage_return = static_cast<codepoint_t>(CARRIAGE_RETURN);
49+
3650
class text_iterator {
3751
const char* current_byte = nullptr;
3852
const char* range_end_byte = nullptr;

0 commit comments

Comments
 (0)