Skip to content

Commit cd32f52

Browse files
committed
feat: properly fold in UTF16 surrogates
1 parent 69bf40a commit cd32f52

7 files changed

Lines changed: 395 additions & 222 deletions

File tree

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON)
9292

9393
add_library(json STATIC
9494
src/json.cpp
95+
src/string_reader.cpp
9596
include/json/json.hpp
9697
include/json/serdes.hpp)
9798
set_target_properties(json PROPERTIES

include/json/json.hpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,23 @@ namespace json {
3030

3131
template <typename Value>
3232
struct ordered_map {
33+
ordered_map() = default;
34+
ordered_map(ordered_map const&) = default;
35+
ordered_map& operator=(ordered_map const&) = default;
36+
ordered_map(ordered_map&&) = default;
37+
ordered_map& operator=(ordered_map&&) = default;
38+
39+
using tree_map_t = std::map<string, Value>;
40+
using tree_map_value_t = tree_map_t::value_type;
41+
42+
ordered_map(std::initializer_list<tree_map_value_t> tree_list)
43+
: tree_{tree_list} {
44+
order_.reserve(tree_list.size());
45+
for (auto const& [key, _] : tree_list) {
46+
order_.push_back(key);
47+
}
48+
}
49+
3350
std::map<string, Value> const& items() const noexcept { return tree_; }
3451
std::span<string const> keys() const noexcept { return order_; }
3552

@@ -118,7 +135,7 @@ namespace json {
118135
}
119136

120137
private:
121-
std::map<string, Value> tree_{};
138+
tree_map_t tree_{};
122139
std::vector<string> order_{};
123140
};
124141

src/json.cpp

Lines changed: 6 additions & 212 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
// Copyright (c) 2021 midnightBITS
22
// This code is licensed under MIT license (see LICENSE for details)
33

4+
#include <array>
45
#include <cassert>
56
#include <cmath>
67
#include <iostream>
78
#include <limits>
89
#include <memory>
10+
#include <print>
911
#include <stack>
1012
#include <json/json.hpp>
1113

@@ -84,17 +86,18 @@ namespace json {
8486
return from_json_impl(value, path);
8587
}
8688

89+
node read_string(string_view::iterator&,
90+
string_view::iterator const&,
91+
read_mode mode);
92+
8793
namespace {
8894
using uchar = unsigned char;
8995
using iterator = string_view::iterator;
9096

9197
void skip_ws(iterator&, iterator const&, read_mode mode);
92-
node read_string(iterator&, iterator const&, read_mode mode);
9398
node read_number(iterator&, iterator const&, read_mode mode);
9499
node read_keyword(iterator&, iterator const&, read_mode mode);
95100

96-
void encode(uint32_t ch, string& target);
97-
98101
void skip_ws(iterator& it, iterator const& end, read_mode mode) {
99102
if (mode == read_mode::strict) {
100103
while (it != end) {
@@ -438,154 +441,6 @@ namespace json {
438441
return val;
439442
}
440443

441-
// Reads the two hexadecimal digits of an `\xHH` escape.
//
// On entry `it` must point at the 'x' of the escape; it is stepped past
// that character before the digits are requested from hex_digit().
// Returns the combined value (first digit is the high nibble), or
// INV_HEX_SQUARE as soon as hex_digit() reports INV_HEX.
unsigned hex_escape(iterator& it, iterator const& end) {
    // read_string only dispatches here when it sees the 'x'
    assert(*it == 'x');
    ++it;

    unsigned value = 0;
    for (int nibble = 0; nibble < 2; ++nibble) {
        auto const digit = hex_digit(it, end);
        if (digit == INV_HEX) return INV_HEX_SQUARE;
        value = value * 16 + digit;  // NOLINT(readability-magic-numbers)
    }
    return value;
}
453-
454-
// Reads the numeric part of a `\u` escape.
//
// On entry `it` must point at the 'u'. Two spellings are accepted:
//   - `\u{H...}`  any number of hex digits up to the closing brace; a
//                 missing closing brace at end of input is tolerated and
//                 the digits read so far are returned;
//   - `\uHHHH`    exactly four hex digits.
//
// Returns the decoded value, or numeric_limits<uint32_t>::max() when the
// input ends right after the 'u', when hex_digit() reports INV_HEX, or
// when the braced form does not fit in 32 bits.
uint32_t unicode_escape(iterator& it, iterator const& end) {
    static constexpr auto max = std::numeric_limits<uint32_t>::max();
    // sanity check for read_string
    assert(*it == 'u');

    ++it;
    if (it == end) return max;
    if (*it == '{') {
        ++it;
        uint32_t val = 0;
        while (it != end && *it != '}') {
            auto const dig = hex_digit(it, end);
            if (dig == INV_HEX) return max;
            // Once any of the top four bits is set, multiplying by 16
            // would silently drop them. Comparing the value before and
            // after the multiplication does not catch every wraparound
            // (e.g. 0x11111111 * 16 + 1 wraps back to 0x11111111), so the
            // check is done up front.
            if (val > (max >> 4)) return max;
            val = val * 16 + dig;  // NOLINT(readability-magic-numbers)
        }
        if (it != end) ++it;  // step over the closing brace
        return val;
    }

    uint32_t val = 0;
    for (int i = 0; i < 4; ++i) {
        auto const dig = hex_digit(it, end);
        if (dig == INV_HEX) return max;
        val *= 16;  // NOLINT(readability-magic-numbers)
        val += dig;
    }
    return val;
}
485-
486-
// Parses a quoted string literal starting at `it` and returns it as a
// node holding the decoded text; an empty node signals a parse error.
//
// On entry `it` must point at the opening quote (`"` or `'`). The same
// character closes the literal. In read_mode::strict the function
// rejects: single-quoted literals, escaped line breaks, `\v`, `\x`,
// unknown escapes, raw characters below CHAR_SPACE and unterminated
// literals. In other modes those are accepted.
//
// `\u` escapes are decoded by unicode_escape() and appended through
// encode(). A high surrogate (U+D800..U+DBFF) that is immediately
// followed by a `\u` escape holding a low surrogate (U+DC00..U+DFFF) is
// folded into the single supplementary code point the pair stands for;
// an unpaired surrogate is handed to encode() unchanged.
node read_string(iterator& it, iterator const& end, read_mode mode) {
    // sanity check for value_reader::read and
    // object_reader::read_object_key
    assert(it != end && (*it == '"' || *it == '\''));

    if (mode == read_mode::strict && *it == '\'') return {};

    constexpr uint32_t high_surrogate_first = 0xD800;
    constexpr uint32_t high_surrogate_last = 0xDBFF;
    constexpr uint32_t low_surrogate_first = 0xDC00;
    constexpr uint32_t low_surrogate_last = 0xDFFF;
    constexpr uint32_t supplementary_base = 0x10000;

    auto tmplt = *it;  // the quote character that must close the literal
    ++it;

    string result{};
    bool in_string = true;
    bool in_escape = false;
    while (it != end && in_string) {
        if (in_escape) {
            switch (*it) {
                case '\r':
                    // escaped line break: contributes nothing; a
                    // following '\n' belongs to the same break
                    if (mode == read_mode::strict) return {};
                    ++it;
                    if (it != end && *it == '\n') break;
                    --it;
                    break;
                case '\n':
                    if (mode == read_mode::strict) return {};
                    ++it;
                    if (it != end && *it == '\r') break;
                    --it;
                    break;
                case 'b':
                    result.push_back('\b');
                    break;
                case 'f':
                    result.push_back('\f');
                    break;
                case 'n':
                    result.push_back('\n');
                    break;
                case 'r':
                    result.push_back('\r');
                    break;
                case 't':
                    result.push_back('\t');
                    break;
                case 'v':
                    if (mode == read_mode::strict) return {};
                    result.push_back('\v');
                    break;
                case 'x': {
                    if (mode == read_mode::strict) return {};
                    auto const val = hex_escape(it, end);
                    if (val > 255)  // NOLINT(readability-magic-numbers)
                        return {};
                    result.push_back(static_cast<string::value_type>(
                        static_cast<uchar>(val)));
                    // compensate for the shared ++it after the switch
                    --it;
                    break;
                }
                case 'u': {
                    auto val = unicode_escape(it, end);
                    if (val == std::numeric_limits<uint32_t>::max())
                        return {};

                    if (val >= high_surrogate_first &&
                        val <= high_surrogate_last) {
                        // Look ahead on a copy, so that anything other
                        // than a low-surrogate escape is left for the
                        // main loop to handle (or reject) as before.
                        auto ahead = it;
                        if (ahead != end && *ahead == '\\') {
                            ++ahead;
                            if (ahead != end && *ahead == 'u') {
                                auto const low = unicode_escape(ahead, end);
                                if (low >= low_surrogate_first &&
                                    low <= low_surrogate_last) {
                                    // NOLINTNEXTLINE(readability-magic-numbers)
                                    val = supplementary_base +
                                          ((val - high_surrogate_first) << 10) +
                                          (low - low_surrogate_first);
                                    it = ahead;
                                }
                            }
                        }
                    }

                    encode(val, result);
                    // compensate for the shared ++it after the switch
                    --it;
                    break;
                }
                case '"':
                case '\\':
                case '/':
                    result.push_back(*it);
                    break;
                default:
                    // unknown escape: kept verbatim outside strict mode
                    if (mode == read_mode::strict) return {};
                    result.push_back(*it);
                    break;
            }
            ++it;
            in_escape = false;
            continue;
        }

        if (*it == tmplt) {
            ++it;
            in_string = false;
            continue;
        }

        switch (*it) {
            case '\\':
                in_escape = true;
                break;
            default:
                if (mode == read_mode::strict &&
                    static_cast<unsigned char>(*it) < CHAR_SPACE)
                    return {};
                result.push_back(*it);
        }
        ++it;
    }

    // input ended before the closing quote
    if (mode == read_mode::strict && in_string) return {};
    return node{std::move(result)};
}
588-
589444
node read_int(iterator& it,
590445
iterator const& end,
591446
unsigned base,
@@ -763,67 +618,6 @@ namespace json {
763618
return {};
764619
}
765620

766-
constexpr uchar firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0,
767-
0xF0, 0xF8, 0xFC};
768-
769-
enum : uint32_t {
770-
UNI_SUR_HIGH_START = 0xD800,
771-
UNI_SUR_HIGH_END = 0xDBFF,
772-
UNI_SUR_LOW_START = 0xDC00,
773-
UNI_SUR_LOW_END = 0xDFFF,
774-
UNI_REPLACEMENT_CHAR = 0x0000FFFD,
775-
UNI_MAX_BMP = 0x0000FFFF,
776-
UNI_MAX_UTF16 = 0x0010FFFF,
777-
UNI_MAX_LEGAL_UTF32 = 0x0010FFFF
778-
};
779-
780-
constexpr uint32_t byteMask = 0xBF;
781-
constexpr uint32_t byteMark = 0x80;
782-
783-
void encode(uint32_t ch, string& target) {
784-
unsigned short bytesToWrite = 0;
785-
786-
/* Figure out how many bytes the result will require */
787-
if (ch < 0x80u) // NOLINT
788-
bytesToWrite = 1;
789-
else if (ch < 0x800u) // NOLINT
790-
bytesToWrite = 2; // NOLINT
791-
else if (ch >= UNI_SUR_HIGH_START &&
792-
ch <= UNI_SUR_LOW_END) { // NOLINT
793-
bytesToWrite = 3; // NOLINT
794-
ch = UNI_REPLACEMENT_CHAR;
795-
} else if (ch < 0x10000u) // NOLINT
796-
bytesToWrite = 3; // NOLINT
797-
else if (ch <= UNI_MAX_LEGAL_UTF32)
798-
bytesToWrite = 4; // NOLINT
799-
else {
800-
bytesToWrite = 3; // NOLINT
801-
ch = UNI_REPLACEMENT_CHAR;
802-
}
803-
804-
uchar mid[4];
805-
uchar* midp = mid + sizeof(mid);
806-
switch (bytesToWrite) { /* note: everything falls through. */
807-
case 4: // NOLINT
808-
*--midp = static_cast<uchar>((ch | byteMark) & byteMask);
809-
ch >>= 6; // NOLINT
810-
[[fallthrough]];
811-
case 3:
812-
*--midp = static_cast<uchar>((ch | byteMark) & byteMask);
813-
ch >>= 6; // NOLINT
814-
[[fallthrough]];
815-
case 2:
816-
*--midp = static_cast<uchar>((ch | byteMark) & byteMask);
817-
ch >>= 6; // NOLINT
818-
[[fallthrough]];
819-
case 1:
820-
*--midp =
821-
static_cast<uchar>(ch | firstByteMark[bytesToWrite]);
822-
}
823-
for (int i = 0; i < bytesToWrite; ++i)
824-
target.push_back(static_cast<string::value_type>(*midp++));
825-
}
826-
827621
struct size_judge {
828622
size_t allowed_space;
829623

0 commit comments

Comments
 (0)