Skip to content

Commit 127667d

Browse files
committed
Properly handle UTF-16 surrogates in JSON
The previous unescaping of JSON did not take UTF-16 surrogate pairs into consideration.
1 parent 312fdb6 commit 127667d

1 file changed

Lines changed: 50 additions & 6 deletions

File tree

src/dged/json.c

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,9 +73,41 @@ static uint32_t codepoint_from_hex(uint8_t bytes[4]) {
7373
return nmbr;
7474
}
7575

76+
/**
 * Decoder state carried across consecutive \uXXXX escapes so that a UTF-16
 * surrogate pair (two escapes) can be combined into one codepoint.
 *
 * A zero-initialized struct is a valid initial state.
 */
struct surrogate_state {
  // true when a high surrogate has been stored and a low surrogate is expected
  bool low;
  // the pending high surrogate, kept as the raw UTF-16 code unit
  // (0xd800-0xdbff) so that it always fits in 16 bits
  uint16_t high_surrogate;

  // the decoded codepoint; only meaningful right after parse_surrogate
  // returned true
  uint32_t codepoint;
};

/**
 * Feed one \uXXXX value into the surrogate decoder.
 *
 * @param state decoder state, updated in place.
 * @param codepoint the 16-bit value read from a single \uXXXX escape.
 * @returns true if state->codepoint now holds a complete codepoint that
 * should be emitted, false if the value was a high surrogate that has been
 * stored while waiting for its low surrogate (nothing to emit yet).
 *
 * Malformed sequences are handled as follows:
 *  - a high surrogate that is never followed by a low surrogate is dropped,
 *  - a low surrogate without a preceding high surrogate yields U+FFFD
 *    (the Unicode replacement character).
 */
static bool parse_surrogate(struct surrogate_state *state, uint32_t codepoint) {
  const bool is_high = codepoint >= 0xd800 && codepoint <= 0xdbff;
  const bool is_low = codepoint >= 0xdc00 && codepoint <= 0xdfff;

  if (is_high) {
    // Store the raw code unit; the shifted value ((cp - 0xd800) << 10) can be
    // as large as 0xffc00 and would not fit in the 16 bit field. Any
    // previously pending (unpaired) high surrogate is replaced.
    state->high_surrogate = (uint16_t)codepoint;
    state->low = true;
    return false;
  }

  if (is_low) {
    if (!state->low) {
      // low surrogate with nothing to pair it with
      state->codepoint = 0xfffd;
      return true;
    }

    // combine the pair: 10 bits from the high surrogate, 10 bits from the low
    // surrogate, offset by 0x10000
    const uint32_t high_bits = ((uint32_t)state->high_surrogate - 0xd800u)
                               << 10;
    state->codepoint = 0x10000u + high_bits + (codepoint - 0xdc00u);
    state->low = false;
    return true;
  }

  // a non-surrogate codepoint; forget any pending unpaired high surrogate so
  // it cannot be combined with a later, unrelated escape
  state->low = false;
  state->codepoint = codepoint;
  return true;
}
103+
76104
struct s8 unescape_json_string(struct s8 input) {
77105
size_t new_size = 0;
78106
bool escape = false;
107+
108+
struct surrogate_state state = {};
109+
memset(&state, 0, sizeof(struct surrogate_state));
110+
79111
for (size_t bi = 0; bi < input.l; ++bi) {
80112
uint8_t b = input.s[bi];
81113

@@ -88,16 +120,22 @@ struct s8 unescape_json_string(struct s8 input) {
88120
if (b == 'u' && escape) {
89121
// unicode codepoint, calculate byte-width
90122
// format is \uXXXX where X is a hex digit.
91-
uint8_t chars[4];
92123
uint32_t codepoint = codepoint_from_hex(&input.s[bi + 1]);
93-
sz = utf8_encode(codepoint, chars);
124+
if (parse_surrogate(&state, codepoint)) {
125+
uint8_t chars[4];
126+
sz = utf8_encode(state.codepoint, chars);
127+
} else {
128+
sz = 0;
129+
}
130+
94131
bi += 4;
95132
}
96133

97134
new_size += sz;
98135
escape = false;
99136
}
100137

138+
memset(&state, 0, sizeof(struct surrogate_state));
101139
escape = false;
102140
uint8_t *buf = calloc(new_size + 1, 1);
103141
size_t bufi = 0;
@@ -131,11 +169,17 @@ struct s8 unescape_json_string(struct s8 input) {
131169
buf[bufi] = '\t';
132170
break;
133171
case 'u': {
134-
uint8_t chars[4] = {0};
135172
uint32_t codepoint = codepoint_from_hex(&input.s[bi + 1]);
136-
size_t size = utf8_encode(codepoint, chars);
137-
memcpy(&buf[bufi], chars, size);
138-
skip = size;
173+
174+
if (parse_surrogate(&state, codepoint)) {
175+
uint8_t chars[4] = {0};
176+
size_t size = utf8_encode(state.codepoint, chars);
177+
memcpy(&buf[bufi], chars, size);
178+
skip = size;
179+
} else {
180+
skip = 0;
181+
}
182+
139183
bi += 4;
140184
} break;
141185
case '"':

0 commit comments

Comments
 (0)