Skip to content

Commit cff5374

Browse files
committed
fast check for ascii
1 parent 251f868 commit cff5374

6 files changed

Lines changed: 250 additions & 70 deletions

File tree

include/jsoncons/json_error.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ namespace jsoncons {
4242
unexpected_rbracket,
4343
unexpected_rbrace,
4444
illegal_comment,
45-
expected_continuation_byte,
45+
bad_continuation_byte,
4646
over_long_utf8_sequence,
4747
illegal_codepoint,
4848
illegal_surrogate_value,
@@ -111,7 +111,7 @@ namespace jsoncons {
111111
return "Unexpected right bracket ']'";
112112
case json_errc::illegal_comment:
113113
return "Illegal comment";
114-
case json_errc::expected_continuation_byte:
114+
case json_errc::bad_continuation_byte:
115115
return "Expected continuation byte";
116116
case json_errc::over_long_utf8_sequence:
117117
return "Over long UTF-8 sequence";

include/jsoncons/json_parser.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2350,11 +2350,11 @@ class basic_json_parser : public ser_context
23502350
return;
23512351
}
23522352
break;
2353-
case unicode_traits::conv_errc::expected_continuation_byte:
2354-
more_ = err_handler_(json_errc::expected_continuation_byte, *this);
2353+
case unicode_traits::conv_errc::bad_continuation_byte:
2354+
more_ = err_handler_(json_errc::bad_continuation_byte, *this);
23552355
if (!more_)
23562356
{
2357-
ec = json_errc::expected_continuation_byte;
2357+
ec = json_errc::bad_continuation_byte;
23582358
return;
23592359
}
23602360
break;

include/jsoncons/utility/unicode_traits.hpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ namespace unicode_traits {
233233
{
234234
success = 0,
235235
over_long_utf8_sequence = 1, // over long utf8 sequence
236-
expected_continuation_byte, // expected continuation byte
236+
bad_continuation_byte, // expected continuation byte
237237
unpaired_high_surrogate, // unpaired high surrogate UTF-16
238238
illegal_surrogate_value, // UTF-16 surrogate values are illegal in UTF-32
239239
source_exhausted, // partial character in source, but hit end
@@ -254,7 +254,7 @@ namespace unicode_traits {
254254
{
255255
case conv_errc::over_long_utf8_sequence:
256256
return "Over long utf8 sequence";
257-
case conv_errc::expected_continuation_byte:
257+
case conv_errc::bad_continuation_byte:
258258
return "Expected continuation byte";
259259
case conv_errc::unpaired_high_surrogate:
260260
return "Unpaired high surrogate UTF-16";
@@ -311,15 +311,15 @@ namespace unicode_traits {
311311
return conv_errc::over_long_utf8_sequence;
312312
case 4:
313313
if (((byte = (*--end))& 0xC0) != 0x80)
314-
return conv_errc::expected_continuation_byte;
314+
return conv_errc::bad_continuation_byte;
315315
JSONCONS_FALLTHROUGH;
316316
case 3:
317317
if (((byte = (*--end))& 0xC0) != 0x80)
318-
return conv_errc::expected_continuation_byte;
318+
return conv_errc::bad_continuation_byte;
319319
JSONCONS_FALLTHROUGH;
320320
case 2:
321321
if (((byte = (*--end))& 0xC0) != 0x80)
322-
return conv_errc::expected_continuation_byte;
322+
return conv_errc::bad_continuation_byte;
323323

324324
switch (*it)
325325
{

test/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ add_executable(unit_tests
151151
corelib/src/utility/heap_string_tests.cpp
152152
corelib/src/utility/read_number_tests.cpp
153153
corelib/src/utility/string_utils_tests.cpp
154-
corelib/src/utility/unicode_conv_tests.cpp
154+
corelib/src/utility/unicode_traits_tests.cpp
155155
corelib/src/utility/uri_tests.cpp
156156
corelib/src/wjson_tests.cpp
157157
csv/src/csv_cursor_tests.cpp

test/corelib/src/utility/unicode_conv_tests.cpp

Lines changed: 0 additions & 59 deletions
This file was deleted.
Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
// Copyright 2013-2026 Daniel Parker
2+
// Distributed under Boost license
3+
4+
#include <jsoncons/json.hpp>
5+
#include <sstream>
6+
#include <vector>
7+
#include <utility>
8+
#include <ctime>
9+
#include <string>
10+
#include <catch/catch.hpp>
11+
12+
using namespace jsoncons;
13+
14+
TEST_CASE("unicode_traits tests")
{
    // Round trip: escaped text (including one surrogate pair) is parsed and then
    // dumped with escape_all_non_ascii enabled; the output must equal the input.
    SECTION("test_surrogate_pair")
    {
        std::string expected = "[\"\\u8A73\\u7D30\\u95B2\\u89A7\\uD800\\uDC01\\u4E00\"]";
        json doc = json::parse(expected);

        json_options opts;
        opts.escape_all_non_ascii(true);

        std::string actual;
        doc.dump(actual, opts);

        CHECK(expected == actual);
    }

    // Same round trip as above, using the wide-character json types.
    SECTION("test_wide_surrogate_pair")
    {
        std::wstring expected = L"[\"\\u8A73\\u7D30\\u95B2\\u89A7\\uD800\\uDC01\\u4E00\"]";
        wjson doc = wjson::parse(expected);

        wjson_options opts;
        opts.escape_all_non_ascii(true);

        std::wstring actual;
        doc.dump(actual, opts);

        CHECK(expected == actual);
    }

    // An object parsed from a stream that contains a surrogate pair can still
    // be extended with a double member, and that member reads back unchanged.
    SECTION("with double")
    {
        std::istringstream source("{\"unicode_string_1\":\"\\uD800\\uDC00\"}");

        json root = json::parse(source);
        CHECK(root.is_object());
        CHECK(root.is_object());

        root["double_1"] = 10.0;

        json stored = root["double_1"];
        const double first_read = stored.as<double>();
        const double second_read = stored.as<double>();

        CHECK(10.0 == Approx(first_read).epsilon(0.000001));

        CHECK(10.0 == Approx(second_read).epsilon(0.000001));
    }
}
55+
56+
TEST_CASE("unicode_traits utf8 tests")
57+
{
58+
SECTION("Valid continuation bytes")
59+
{
60+
std::vector<uint8_t> seq1 = {0xC2, 0x80};
61+
auto result1 = unicode_traits::validate(seq1.data(), seq1.size());
62+
CHECK(unicode_traits::conv_errc::success == result1.ec);
63+
64+
std::vector<uint8_t> seq2 = {0xC2, 0xBF};
65+
auto result2 = unicode_traits::validate(seq1.data(), seq1.size());
66+
CHECK(unicode_traits::conv_errc::success == result2.ec);
67+
68+
std::vector<uint8_t> seq3 = {0xDF, 0xBF};
69+
auto result3 = unicode_traits::validate(seq1.data(), seq1.size());
70+
CHECK(unicode_traits::conv_errc::success == result3.ec);
71+
72+
std::vector<uint8_t> seq4 = {0xE0, 0xA0, 0x80};
73+
auto result4 = unicode_traits::validate(seq1.data(), seq1.size());
74+
CHECK(unicode_traits::conv_errc::success == result4.ec);
75+
76+
std::vector<uint8_t> seq5 = {0xEF, 0xBF, 0xBF};
77+
auto result5 = unicode_traits::validate(seq1.data(), seq1.size());
78+
CHECK(unicode_traits::conv_errc::success == result5.ec);
79+
80+
std::vector<uint8_t> seq6 = {0xF0, 0x90, 0x80, 0x80};
81+
auto result6 = unicode_traits::validate(seq1.data(), seq1.size());
82+
CHECK(unicode_traits::conv_errc::success == result6.ec);
83+
84+
std::vector<uint8_t> seq7 = {0xF4, 0x8F, 0xBF, 0xBF};
85+
auto result7 = unicode_traits::validate(seq1.data(), seq1.size());
86+
CHECK(unicode_traits::conv_errc::success == result7.ec);
87+
}
88+
SECTION("Boundary tests")
89+
{
90+
std::vector<uint8_t> seq1 = {0xC2, 0x80};
91+
auto result1 = unicode_traits::validate(seq1.data(), seq1.size());
92+
CHECK(unicode_traits::conv_errc::success == result1.ec);
93+
94+
std::vector<uint8_t> seq2 = {0xC2, 0xBF};
95+
auto result2 = unicode_traits::validate(seq1.data(), seq1.size());
96+
CHECK(unicode_traits::conv_errc::success == result2.ec);
97+
98+
std::vector<uint8_t> seq3 = {0xE0, 0xA0, 0x80};
99+
auto result3 = unicode_traits::validate(seq1.data(), seq1.size());
100+
CHECK(unicode_traits::conv_errc::success == result3.ec);
101+
102+
std::vector<uint8_t> seq4 = {0xED, 0x9F, 0xBF};
103+
auto result4 = unicode_traits::validate(seq1.data(), seq1.size());
104+
CHECK(unicode_traits::conv_errc::success == result4.ec);
105+
106+
std::vector<uint8_t> seq5 = {0xF0, 0x90, 0x80, 0x80};
107+
auto result5 = unicode_traits::validate(seq1.data(), seq1.size());
108+
CHECK(unicode_traits::conv_errc::success == result5.ec);
109+
110+
std::vector<uint8_t> seq6 = {0xF4, 0x8F, 0xBF, 0xBF};
111+
auto result6 = unicode_traits::validate(seq1.data(), seq1.size());
112+
CHECK(unicode_traits::conv_errc::success == result6.ec);
113+
}
114+
SECTION("Invalid isolated bytes")
115+
{
116+
std::vector<uint8_t> seq1 = {0x80};
117+
auto result1 = unicode_traits::validate(seq1.data(), seq1.size());
118+
CHECK(unicode_traits::conv_errc::source_illegal == result1.ec);
119+
120+
std::vector<uint8_t> seq2 = {0x81};
121+
auto result2 = unicode_traits::validate(seq1.data(), seq1.size());
122+
CHECK(unicode_traits::conv_errc::source_illegal == result2.ec);
123+
124+
std::vector<uint8_t> seq3 = {0x90};
125+
auto result3 = unicode_traits::validate(seq1.data(), seq1.size());
126+
CHECK(unicode_traits::conv_errc::source_illegal == result3.ec);
127+
128+
std::vector<uint8_t> seq4 = {0xA0};
129+
auto result4 = unicode_traits::validate(seq1.data(), seq1.size());
130+
CHECK(unicode_traits::conv_errc::source_illegal == result4.ec);
131+
132+
std::vector<uint8_t> seq5 = {0xBF};
133+
auto result5 = unicode_traits::validate(seq1.data(), seq1.size());
134+
CHECK(unicode_traits::conv_errc::source_illegal == result5.ec);
135+
136+
std::vector<uint8_t> seq6 = {0xC0};
137+
auto result6 = unicode_traits::validate(seq1.data(), seq1.size());
138+
CHECK(unicode_traits::conv_errc::source_illegal == result6.ec);
139+
140+
std::vector<uint8_t> seq7 = {0xC1};
141+
auto result7 = unicode_traits::validate(seq1.data(), seq1.size());
142+
CHECK(unicode_traits::conv_errc::source_illegal == result7.ec);
143+
144+
for (int c = 0xF5; c <= int(0xFF); ++c)
145+
{
146+
std::vector<uint8_t> seq = {uint8_t(c)};
147+
auto result = unicode_traits::validate(seq1.data(), seq1.size());
148+
CHECK(unicode_traits::conv_errc::source_illegal == result.ec);
149+
}
150+
}
151+
SECTION("Invalid multiple continuation bytes without a starter")
152+
{
153+
std::vector<uint8_t> seq1 = {0x80, 0x80};
154+
auto result1 = unicode_traits::validate(seq1.data(), seq1.size());
155+
CHECK(unicode_traits::conv_errc::source_illegal == result1.ec);
156+
157+
std::vector<uint8_t> seq2 = {0xBF, 0xBF};
158+
auto result2 = unicode_traits::validate(seq1.data(), seq1.size());
159+
CHECK(unicode_traits::conv_errc::source_illegal == result2.ec);
160+
161+
std::vector<uint8_t> seq3 = {0x80, 0xBF, 0x80};
162+
auto result3 = unicode_traits::validate(seq1.data(), seq1.size());
163+
CHECK(unicode_traits::conv_errc::source_illegal == result3.ec);
164+
165+
std::vector<uint8_t> seq4 = {0x80, 0x81, 0x82, 0x83};
166+
auto result4 = unicode_traits::validate(seq1.data(), seq1.size());
167+
CHECK(unicode_traits::conv_errc::source_illegal == result4.ec);
168+
}
169+
SECTION("Invalid continuation after ASCII")
170+
{
171+
std::vector<uint8_t> seq1 = {0x41, 0x80};
172+
auto result1 = unicode_traits::validate(seq1.data(), seq1.size());
173+
CHECK(unicode_traits::conv_errc::source_illegal == result1.ec);
174+
175+
std::vector<uint8_t> seq2 = {0x7F, 0xBF};
176+
auto result2 = unicode_traits::validate(seq1.data(), seq1.size());
177+
CHECK(unicode_traits::conv_errc::source_illegal == result2.ec);
178+
179+
std::vector<uint8_t> seq3 = {0x20, 0x80};
180+
auto result3 = unicode_traits::validate(seq1.data(), seq1.size());
181+
CHECK(unicode_traits::conv_errc::source_illegal == result3.ec);
182+
}
183+
SECTION("Invalid too few continuation bytes")
184+
{
185+
std::vector<uint8_t> seq1 = {0xC2};
186+
auto result1 = unicode_traits::validate(seq1.data(), seq1.size());
187+
CHECK(unicode_traits::conv_errc::source_exhausted == result1.ec);
188+
189+
std::vector<uint8_t> seq2 = {0xE2, 0x82};
190+
auto result2 = unicode_traits::validate(seq1.data(), seq1.size());
191+
CHECK(unicode_traits::conv_errc::source_exhausted == result2.ec);
192+
193+
std::vector<uint8_t> seq3 = {0xF0, 0x90, 0x80};
194+
auto result3 = unicode_traits::validate(seq1.data(), seq1.size());
195+
CHECK(unicode_traits::conv_errc::source_exhausted == result3.ec);
196+
}
197+
SECTION("Invalid: wrong continuation byte")
198+
{
199+
std::vector<uint8_t> seq1 = {0xC2, 0x7F};
200+
auto result1 = unicode_traits::validate(seq1.data(), seq1.size());
201+
CHECK(unicode_traits::conv_errc::bad_continuation_byte == result1.ec);
202+
203+
std::vector<uint8_t> seq2 = {0xC2, 0xC0};
204+
auto result2 = unicode_traits::validate(seq1.data(), seq1.size());
205+
CHECK(unicode_traits::conv_errc::bad_continuation_byte == result2.ec);
206+
207+
std::vector<uint8_t> seq3 = {0xE2, 0x28, 0xA1};
208+
auto result3 = unicode_traits::validate(seq1.data(), seq1.size());
209+
CHECK(unicode_traits::conv_errc::bad_continuation_byte == result3.ec);
210+
211+
std::vector<uint8_t> seq4 = {0xE2, 0xC0, 0x80};
212+
auto result4 = unicode_traits::validate(seq1.data(), seq1.size());
213+
CHECK(unicode_traits::conv_errc::bad_continuation_byte == result4.ec);
214+
215+
std::vector<uint8_t> seq5 = {0xF0, 0x90, 0x41, 0x80};
216+
auto result5 = unicode_traits::validate(seq1.data(), seq1.size());
217+
CHECK(unicode_traits::conv_errc::bad_continuation_byte == result5.ec);
218+
219+
std::vector<uint8_t> seq6 = {0xF0, 0x7F, 0x80, 0x80};
220+
auto result6 = unicode_traits::validate(seq1.data(), seq1.size());
221+
CHECK(unicode_traits::conv_errc::bad_continuation_byte == result6.ec);
222+
}
223+
SECTION("Invalid: extra continuation byte")
224+
{
225+
std::vector<uint8_t> seq1 = {0xC2, 0x80, 0x80};
226+
auto result1 = unicode_traits::validate(seq1.data(), seq1.size());
227+
CHECK(unicode_traits::conv_errc::source_illegal == result1.ec);
228+
229+
std::vector<uint8_t> seq2 = {0xE2, 0x82, 0xAC, 0x80};
230+
auto result2 = unicode_traits::validate(seq1.data(), seq1.size());
231+
CHECK(unicode_traits::conv_errc::source_illegal == result2.ec);
232+
233+
std::vector<uint8_t> seq3 = {0xF0, 0x90, 0x80, 0x80};
234+
auto result3 = unicode_traits::validate(seq1.data(), seq1.size());
235+
CHECK(unicode_traits::conv_errc::source_illegal == result3.ec);
236+
}
237+
}
238+
239+

0 commit comments

Comments
 (0)