@@ -73,9 +73,41 @@ static uint32_t codepoint_from_hex(uint8_t bytes[4]) {
7373 return nmbr ;
7474}
7575
/*
 * Decoder state for \uXXXX escapes that may arrive as UTF-16 surrogate pairs.
 * Zero-initialise before the first call to parse_surrogate.
 */
struct surrogate_state {
  /* true while a high surrogate has been consumed and its low half is awaited */
  bool low;
  /* the pending high surrogate code unit (0xD800..0xDBFF); valid while `low` is set */
  uint16_t high_surrogate;

  /* result: the decoded code point; valid after parse_surrogate returns true */
  uint32_t codepoint;
};

/*
 * Feed one 16-bit escape value (as a code point) into the surrogate decoder.
 *
 * Returns true when `state->codepoint` holds a complete code point that
 * should be emitted, and false when the value was a high surrogate that
 * has only been buffered (nothing to emit yet).
 *
 * Malformed sequences are handled deterministically:
 *  - a low surrogate with no pending high surrogate yields U+FFFD;
 *  - a high surrogate followed by another high surrogate drops the first;
 *  - a high surrogate followed by a non-surrogate drops the high surrogate
 *    and emits the non-surrogate unchanged.
 */
static bool parse_surrogate(struct surrogate_state *state, uint32_t codepoint) {
  const bool is_high = codepoint >= 0xd800 && codepoint <= 0xdbff;
  const bool is_low = codepoint >= 0xdc00 && codepoint <= 0xdfff;

  if (is_high) {
    // Keep the raw code unit: the shifted value (up to 0xFFC00) would not
    // fit in the 16-bit field, so the shift is deferred until combining.
    state->high_surrogate = (uint16_t)codepoint;
    state->low = true;
    return false;
  }

  if (is_low) {
    if (!state->low) {
      // unpaired low surrogate: substitute the replacement character
      state->codepoint = 0xfffd;
      return true;
    }

    // combine the two 10-bit halves in 32-bit arithmetic
    const uint32_t high_bits = (uint32_t)(state->high_surrogate - 0xd800) << 10;
    state->codepoint = 0x10000 + high_bits + (codepoint - 0xdc00);
    state->low = false;
    return true;
  }

  // a non-surrogate codepoint; forget any dangling high surrogate so it
  // cannot be paired with a later, unrelated escape
  state->low = false;
  state->codepoint = codepoint;
  return true;
}
103+
76104struct s8 unescape_json_string (struct s8 input ) {
77105 size_t new_size = 0 ;
78106 bool escape = false;
107+
108+ struct surrogate_state state = {};
109+ memset (& state , 0 , sizeof (struct surrogate_state ));
110+
79111 for (size_t bi = 0 ; bi < input .l ; ++ bi ) {
80112 uint8_t b = input .s [bi ];
81113
@@ -88,16 +120,22 @@ struct s8 unescape_json_string(struct s8 input) {
88120 if (b == 'u' && escape ) {
89121 // unicode codepoint, calculate byte-width
90122 // format is \uXXXX where X is a hex digit.
91- uint8_t chars [4 ];
92123 uint32_t codepoint = codepoint_from_hex (& input .s [bi + 1 ]);
93- sz = utf8_encode (codepoint , chars );
124+ if (parse_surrogate (& state , codepoint )) {
125+ uint8_t chars [4 ];
126+ sz = utf8_encode (state .codepoint , chars );
127+ } else {
128+ sz = 0 ;
129+ }
130+
94131 bi += 4 ;
95132 }
96133
97134 new_size += sz ;
98135 escape = false;
99136 }
100137
138+ memset (& state , 0 , sizeof (struct surrogate_state ));
101139 escape = false;
102140 uint8_t * buf = calloc (new_size + 1 , 1 );
103141 size_t bufi = 0 ;
@@ -131,11 +169,17 @@ struct s8 unescape_json_string(struct s8 input) {
131169 buf [bufi ] = '\t' ;
132170 break ;
133171 case 'u' : {
134- uint8_t chars [4 ] = {0 };
135172 uint32_t codepoint = codepoint_from_hex (& input .s [bi + 1 ]);
136- size_t size = utf8_encode (codepoint , chars );
137- memcpy (& buf [bufi ], chars , size );
138- skip = size ;
173+
174+ if (parse_surrogate (& state , codepoint )) {
175+ uint8_t chars [4 ] = {0 };
176+ size_t size = utf8_encode (state .codepoint , chars );
177+ memcpy (& buf [bufi ], chars , size );
178+ skip = size ;
179+ } else {
180+ skip = 0 ;
181+ }
182+
139183 bi += 4 ;
140184 } break ;
141185 case '"' :
0 commit comments