Skip to content

Commit 243532e

Browse files
kwajiehao and CISC authored
jinja : support ensure_ascii=true, string repetition and int/float self-filtering (#21623)
* feat: jinja engine improvements for reka-edge

  Port three Jinja engine improvements needed for the reka-edge model:
  1. Python-style string repetition ("ab" * 3 → "ababab")
  2. ensure_ascii=true support for the tojson filter (escapes non-ASCII characters to \uXXXX)
  3. int() builtin on value_int_t (identity, needed for the Reka Edge template)

* fix: escape invalid UTF-8 bytes when ensure_ascii=true

  The json_ensure_ascii_preserving_format function did not correctly handle the edge case where UTF-8 parsing fails: it added the offending non-ASCII byte back to the output as a raw byte. This commit fixes that by adding the escaped Unicode replacement character (\ufffd) to the output instead. This is the standard behavior in various programming languages such as Python, Rust, and Go.

* chore: address PR comments

  1. Add a TODO comment for supporting repetition of arrays/tuples
  2. Add support for the float identity operation
  3. Move the invalid UTF-8 test case to test_fuzzing

* chore: accept suggestion for common/jinja/value.cpp

  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
1 parent 5e9c635 commit 243532e

File tree

3 files changed

+160
-3
lines changed

3 files changed

+160
-3
lines changed

common/jinja/runtime.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,23 @@ value binary_expression::execute_impl(context & ctx) {
251251
return res;
252252
}
253253

254+
// Python-style string repetition
255+
// TODO: support array/tuple repetition (e.g., [1, 2] * 3 → [1, 2, 1, 2, 1, 2])
256+
if (op.value == "*" &&
257+
((is_val<value_string>(left_val) && is_val<value_int>(right_val)) ||
258+
(is_val<value_int>(left_val) && is_val<value_string>(right_val)))) {
259+
const auto & str = is_val<value_string>(left_val) ? left_val->as_string() : right_val->as_string();
260+
const int64_t repeat = is_val<value_int>(right_val) ? right_val->as_int() : left_val->as_int();
261+
auto res = mk_val<value_string>();
262+
if (repeat <= 0) {
263+
return res;
264+
}
265+
for (int64_t i = 0; i < repeat; ++i) {
266+
res->val_str = res->val_str.append(str);
267+
}
268+
return res;
269+
}
270+
254271
// String membership
255272
if (is_val<value_string>(left_val) && is_val<value_string>(right_val)) {
256273
// case: "a" in "abc"

common/jinja/value.cpp

Lines changed: 90 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "runtime.h"
2+
#include "unicode.h"
23
#include "value.h"
34

45
// for converting from JSON to jinja values
@@ -154,6 +155,83 @@ static value test_compare_fn(const func_args & args) {
154155
return mk_val<value_bool>(value_compare(args.get_pos(0), args.get_pos(1), op));
155156
}
156157

158+
// Appends the JSON "\uXXXX" escape sequence for `codepoint` to `out`, using lowercase hex digits.
//
//   - Code points up to U+FFFF are written as a single escape.
//   - Code points in U+10000..U+10FFFF are written as a UTF-16 surrogate pair (high unit, then low unit).
//   - Values above U+10FFFF are not Unicode code points. They are written as the replacement
//     character escape "\ufffd" rather than being folded onto an unrelated character by the
//     10-bit masks used to build the surrogate pair.
static void append_codepoint_as_ascii_json_escape(std::string & out, uint32_t codepoint) {
    static const char hex_digits[] = "0123456789abcdef";

    // Writes one 16-bit code unit as a backslash, 'u' and exactly four hex digits.
    auto append_u16 = [&out](uint32_t unit) {
        out += "\\u";
        out += hex_digits[(unit >> 12) & 0xF];
        out += hex_digits[(unit >>  8) & 0xF];
        out += hex_digits[(unit >>  4) & 0xF];
        out += hex_digits[ unit        & 0xF];
    };

    if (codepoint <= 0xFFFF) {
        append_u16(codepoint);
        return;
    }

    if (codepoint > 0x10FFFF) {
        // outside the Unicode range: cannot be represented as a surrogate pair
        append_u16(0xFFFD);
        return;
    }

    // Supplementary plane: split the 20-bit offset into two 10-bit halves.
    codepoint -= 0x10000;
    append_u16(0xD800 + ((codepoint >> 10) & 0x3FF));
    append_u16(0xDC00 + (codepoint & 0x3FF));
}
174+
175+
static std::string json_ensure_ascii_preserving_format(const std::string & json_str) {
176+
std::string output;
177+
output.reserve(json_str.size());
178+
179+
bool in_string = false;
180+
bool escaped = false;
181+
182+
for (size_t pos = 0; pos < json_str.size();) {
183+
const char ch = json_str[pos];
184+
if (!in_string) {
185+
output.push_back(ch);
186+
if (ch == '"') {
187+
in_string = true;
188+
}
189+
++pos;
190+
continue;
191+
}
192+
193+
if (escaped) {
194+
output.push_back(ch);
195+
escaped = false;
196+
++pos;
197+
continue;
198+
}
199+
200+
if (ch == '\\') {
201+
output.push_back(ch);
202+
escaped = true;
203+
++pos;
204+
continue;
205+
}
206+
207+
if (ch == '"') {
208+
output.push_back(ch);
209+
in_string = false;
210+
++pos;
211+
continue;
212+
}
213+
214+
const unsigned char uch = static_cast<unsigned char>(ch);
215+
if (uch < 0x80) {
216+
output.push_back(ch);
217+
++pos;
218+
continue;
219+
}
220+
221+
auto parsed = common_parse_utf8_codepoint(json_str, pos);
222+
if (parsed.status != utf8_parse_result::SUCCESS) {
223+
output += "\\ufffd";
224+
++pos;
225+
continue;
226+
}
227+
228+
append_codepoint_as_ascii_json_escape(output, parsed.codepoint);
229+
pos += parsed.bytes_consumed;
230+
}
231+
232+
return output;
233+
}
234+
157235
static value tojson(const func_args & args) {
158236
args.ensure_count(1, 5);
159237
value val_ascii = args.get_kwarg_or_pos("ensure_ascii", 1);
@@ -169,16 +247,17 @@ static value tojson(const func_args & args) {
169247
if (is_val<value_int>(val_indent)) {
170248
indent = static_cast<int>(val_indent->as_int());
171249
}
172-
if (val_ascii->as_bool()) { // undefined == false
173-
throw not_implemented_exception("tojson ensure_ascii=true not implemented");
174-
}
175250
if (val_sort->as_bool()) { // undefined == false
176251
throw not_implemented_exception("tojson sort_keys=true not implemented");
177252
}
253+
const bool ensure_ascii = val_ascii->as_bool(); // undefined == false
178254
auto separators = (is_val<value_array>(val_separators) ? val_separators : mk_val<value_array>())->as_array();
179255
std::string item_sep = separators.size() > 0 ? separators[0]->as_string().str() : (indent < 0 ? ", " : ",");
180256
std::string key_sep = separators.size() > 1 ? separators[1]->as_string().str() : ": ";
181257
std::string json_str = value_to_json(args.get_pos(0), indent, item_sep, key_sep);
258+
if (ensure_ascii) {
259+
json_str = json_ensure_ascii_preserving_format(json_str);
260+
}
182261
return mk_val<value_string>(json_str);
183262
}
184263

@@ -460,6 +539,10 @@ const func_builtins & value_int_t::get_builtins() const {
460539
int64_t val = args.get_pos(0)->as_int();
461540
return mk_val<value_int>(val < 0 ? -val : val);
462541
}},
542+
{"int", [](const func_args & args) -> value {
543+
args.ensure_vals<value_int>();
544+
return mk_val<value_int>(args.get_pos(0)->as_int());
545+
}},
463546
{"float", [](const func_args & args) -> value {
464547
args.ensure_vals<value_int>();
465548
double val = static_cast<double>(args.get_pos(0)->as_int());
@@ -486,6 +569,10 @@ const func_builtins & value_float_t::get_builtins() const {
486569
int64_t val = static_cast<int64_t>(args.get_pos(0)->as_float());
487570
return mk_val<value_int>(val);
488571
}},
572+
{"float", [](const func_args & args) -> value {
573+
args.ensure_vals<value_float>();
574+
return mk_val<value_float>(args.get_pos(0)->as_float());
575+
}},
489576
{"safe", tojson},
490577
{"string", tojson},
491578
{"tojson", tojson},

tests/test-jinja.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,18 @@ static void test_expressions(testing & t) {
447447
"hello world"
448448
);
449449

450+
test_template(t, "string repetition",
451+
"{{ 'ab' * 3 }}",
452+
json::object(),
453+
"ababab"
454+
);
455+
456+
test_template(t, "reversed string repetition",
457+
"{{ 3 * 'ab' }}",
458+
json::object(),
459+
"ababab"
460+
);
461+
450462
test_template(t, "ternary",
451463
"{{ 'yes' if cond else 'no' }}",
452464
{{"cond", true}},
@@ -693,6 +705,33 @@ static void test_filters(testing & t) {
693705
"\"\\u2713\""
694706
);
695707

708+
test_template(t, "tojson ensure_ascii=true nested object",
709+
"{{ data|tojson(ensure_ascii=true) }}",
710+
{{"data", {
711+
{"text", "\u2713"},
712+
{"items", json::array({"é", {{"snowman", ""}}})}
713+
}}},
714+
"{\"text\": \"\\u2713\", \"items\": [\"\\u00e9\", {\"snowman\": \"\\u2603\"}]}"
715+
);
716+
717+
test_template(t, "tojson ensure_ascii=true indent=2",
718+
"{{ data|tojson(ensure_ascii=true, indent=2) }}",
719+
{{"data", {
720+
{"text", "\u2713"},
721+
{"nested", {{"accent", "é"}}}
722+
}}},
723+
"{\n \"text\": \"\\u2713\",\n \"nested\": {\n \"accent\": \"\\u00e9\"\n }\n}"
724+
);
725+
726+
test_template(t, "tojson ensure_ascii=true preserves existing escapes",
727+
"{{ data|tojson(ensure_ascii=true) }}",
728+
{{"data", {
729+
{"emoji", "😀"},
730+
{"line", "a\nb"}
731+
}}},
732+
"{\"emoji\": \"\\ud83d\\ude00\", \"line\": \"a\\nb\"}"
733+
);
734+
696735
test_template(t, "tojson sort_keys=true",
697736
"{{ data|tojson(sort_keys=true) }}",
698737
{{"data", {{"b", 2}, {"a", 1}}}},
@@ -771,6 +810,12 @@ static void test_filters(testing & t) {
771810
"hello"
772811
);
773812

813+
test_template(t, "int filter on integer is identity",
814+
"{{ value|int }}",
815+
{{"value", 7}},
816+
"7"
817+
);
818+
774819
test_template(t, "none to string",
775820
"{{ x|string }}",
776821
{{"x", nullptr}},
@@ -2458,4 +2503,12 @@ static void test_fuzzing(testing & t) {
24582503
t.assert_true("builtin " + type_name + "." + fn_name + " #" + std::to_string(i), fuzz_test_template(tmpl, vars));
24592504
}
24602505
});
2506+
2507+
t.test("tojson ensure_ascii=true with invalid utf-8", [&](testing & t) {
2508+
t.assert_true("invalid utf-8 does not crash",
2509+
fuzz_test_template(
2510+
"{{ data|tojson(ensure_ascii=true) }}",
2511+
{{"data", std::string("hello\xfe\xffworld")}}
2512+
));
2513+
});
24612514
}

0 commit comments

Comments
 (0)