Skip to content

Commit 3dadc88

Browse files
authored
common : fix Step-3.5-Flash format detection and thinking support (ggml-org#19635)
* common : fix Step-3.5-Flash format detection and thinking support

Step-3.5-Flash uses the same XML-style tool call format as Qwen3-Coder (<tool_call><function=...><parameter=...>) but its Jinja template lacks the bare <function> and plural <parameters> markers that the detection logic previously required. This caused it to fall through to Hermes 2 Pro, which doesn't call func_args_not_string(), so arguments stayed as JSON strings and templates using arguments|items crashed.

Additionally, the Qwen3-Coder-XML format handler had no thinking support. Models like Step-3.5-Flash that unconditionally emit <think> in their generation prompt need the same thinking_forced_open handling that Nemotron v3 and Hermes 2 Pro already have, otherwise reasoning_content is never separated from content in API responses.

Changes:
- Relax Qwen3-Coder XML detection to only require the 3 shared markers
- Tighten Nemotron v3 branch to also require bare <function> and plural <parameters>, preventing Step-3.5-Flash from being misrouted via <think>
- Add thinking_forced_open support to Qwen3-Coder-XML init function
- Add <think>/</think> to preserved tokens
- Fix build_grammar_xml_tool_call to handle thinking_forced_open in the grammar root rule, allowing </think> before tool calls
- Add Step-3.5-Flash chat template and format detection test

Builds on: ggml-org#19283

* chat : route Step-3.5-Flash to Nemotron v3 PEG parser, add tests

Step-3.5-Flash uses the same XML tool call format as Qwen3-Coder and Nemotron 3 Nano (<tool_call>/<function=...>/<parameter=...>) but with unconditional <think> output. Route it to the Nemotron v3 PEG parser for streaming and schema-aware parameter parsing.

Detection: templates with <think> + XML tool tags use Nemotron v3 PEG parser; templates without <think> (Qwen3-Coder) use GBNF grammar.

Tests cover: basic messages, tool calls with/without thinking content, parallel tool calls, code string parameters, optional </parameter> closing tags, and JSON schema response format.

* chat : remove dead thinking code from qwen3_coder_xml

Remove thinking handling code that became unreachable after routing Step-3.5-Flash to the Nemotron v3 PEG parser. Qwen3-Coder has no <think> in its template, so the thinking_forced_open logic, preserved tokens, and grammar prefix were dead paths.
1 parent 39e4b1d commit 3dadc88

3 files changed

Lines changed: 297 additions & 5 deletions

File tree

common/chat.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3141,15 +3141,15 @@ static common_chat_params common_chat_templates_apply_jinja(
31413141
}
31423142

31433143
// Qwen3-Coder XML format detection (must come before Hermes 2 Pro)
3144-
// Detect via explicit XML markers unique to Qwen3-Coder to avoid false positives in other templates.
3145-
// Require presence of <tool_call>, <function=...>, and <parameter=...> blocks.
3144+
// Detect via XML markers: <tool_call>, <function=...>, and <parameter=...> blocks.
3145+
// Also matches Step-3.5-Flash and Nemotron 3 Nano which use the same output format.
31463146
if (src.find("<tool_call>") != std::string::npos &&
3147-
src.find("<function>") != std::string::npos &&
31483147
src.find("<function=") != std::string::npos &&
3149-
src.find("<parameters>") != std::string::npos &&
31503148
src.find("<parameter=") != std::string::npos) {
31513149
workaround::func_args_not_string(params.messages);
3152-
// Nemotron 3 Nano 30B A3B
3150+
// Models with <think> support (Step-3.5-Flash, Nemotron 3 Nano) use the
3151+
// Nemotron v3 PEG parser for streaming and schema-aware parameter parsing.
3152+
// Qwen3-Coder has no <think> in its template.
31533153
if (src.find("<think>") != std::string::npos) {
31543154
return common_chat_params_init_nemotron_v3(tmpl, params);
31553155
}
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
{% macro render_content(content) %}{% if content is none %}{{- '' }}{% elif content is string %}{{- content }}{% elif content is mapping %}{{- content['value'] if 'value' in content else content['text'] }}{% elif content is iterable %}{% for item in content %}{% if item.type == 'text' %}{{- item['value'] if 'value' in item else item['text'] }}{% elif item.type == 'image' %}<im_patch>{% endif %}{% endfor %}{% endif %}{% endmacro %}
2+
{{bos_token}}{%- if tools %}
3+
{{- '<|im_start|>system\n' }}
4+
{%- if messages[0].role == 'system' %}
5+
{{- render_content(messages[0].content) + '\n\n' }}
6+
{%- endif %}
7+
{{- "# Tools\n\nYou have access to the following functions in JSONSchema format:\n\n<tools>" }}
8+
{%- for tool in tools %}
9+
{{- "\n" }}
10+
{{- tool | tojson(ensure_ascii=False) }}
11+
{%- endfor %}
12+
{{- "\n</tools>\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...>\n...\n</function> block must be nested within <tool_call>\n...\n</tool_call> XML tags\n- Required parameters MUST be specified\n</IMPORTANT><|im_end|>\n" }}
13+
{%- else %}
14+
{%- if messages[0].role == 'system' %}
15+
{{- '<|im_start|>system\n' + render_content(messages[0].content) + '<|im_end|>\n' }}
16+
{%- endif %}
17+
{%- endif %}
18+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
19+
{%- for message in messages[::-1] %}
20+
{%- set index = (messages|length - 1) - loop.index0 %}
21+
{%- if ns.multi_step_tool and message.role == "user" and render_content(message.content) is string and not(render_content(message.content).startswith('<tool_response>') and render_content(message.content).endswith('</tool_response>')) %}
22+
{%- set ns.multi_step_tool = false %}
23+
{%- set ns.last_query_index = index %}
24+
{%- endif %}
25+
{%- endfor %}
26+
{%- for message in messages %}
27+
{%- set content = render_content(message.content) %}
28+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
29+
{%- set role_name = 'observation' if (message.role == "system" and not loop.first and message.name == 'observation') else message.role %}
30+
{{- '<|im_start|>' + role_name + '\n' + content + '<|im_end|>' + '\n' }}
31+
{%- elif message.role == "assistant" %}
32+
{%- if message.reasoning_content is string %}
33+
{%- set reasoning_content = render_content(message.reasoning_content) %}
34+
{%- else %}
35+
{%- if '</think>' in content %}
36+
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
37+
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
38+
{%- else %}
39+
{%- set reasoning_content = '' %}
40+
{%- endif %}
41+
{%- endif %}
42+
{%- if loop.index0 > ns.last_query_index %}
43+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n' + content }}
44+
{%- else %}
45+
{{- '<|im_start|>' + message.role + '\n' + content }}
46+
{%- endif %}
47+
{%- if message.tool_calls %}
48+
{%- for tool_call in message.tool_calls %}
49+
{%- if tool_call.function is defined %}
50+
{%- set tool_call = tool_call.function %}
51+
{%- endif %}
52+
{{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
53+
{%- if tool_call.arguments is defined %}
54+
{%- set arguments = tool_call.arguments %}
55+
{%- for args_name, args_value in arguments|items %}
56+
{{- '<parameter=' + args_name + '>\n' }}
57+
{%- set args_value = args_value | tojson(ensure_ascii=False) | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
58+
{{- args_value }}
59+
{{- '\n</parameter>\n' }}
60+
{%- endfor %}
61+
{%- endif %}
62+
{{- '</function>\n</tool_call>' }}
63+
{%- endfor %}
64+
{%- endif %}
65+
{{- '<|im_end|>\n' }}
66+
{%- elif message.role == "tool" %}
67+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
68+
{{- '<|im_start|>tool_response\n' }}
69+
{%- endif %}
70+
{{- '<tool_response>' }}
71+
{{- content }}
72+
{{- '</tool_response>' }}
73+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
74+
{{- '<|im_end|>\n' }}
75+
{%- endif %}
76+
{%- endif %}
77+
{%- endfor %}
78+
{%- if add_generation_prompt %}
79+
{{- '<|im_start|>assistant\n<think>\n' }}
80+
{%- endif %}

tests/test-chat.cpp

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3553,6 +3553,28 @@ Hey there!<|im_end|>
35533553
auto grammar = build_grammar(params.grammar);
35543554
GGML_ASSERT(grammar && "Failed to build Qwen3-Coder grammar with union types");
35553555
}
3556+
3557+
{
3558+
// Step-3.5-Flash template: uses same XML output format as Qwen3-Coder and Nemotron v3,
3559+
// but with <think> support. Routes to the Nemotron v3 PEG parser for streaming and
3560+
// schema-aware parameter parsing.
3561+
auto tmpls = read_templates("models/templates/stepfun-ai-Step-3.5-Flash.jinja");
3562+
assert_equals(COMMON_CHAT_FORMAT_PEG_CONSTRUCTED, common_chat_templates_apply(tmpls.get(), inputs_tools).format);
3563+
3564+
// Grammar and PEG parser should be generated with thinking_forced_open
3565+
{
3566+
common_chat_templates_inputs inputs;
3567+
inputs.messages = { message_user };
3568+
inputs.tools = { special_function_tool };
3569+
auto params = common_chat_templates_apply(tmpls.get(), inputs);
3570+
assert_equals(COMMON_CHAT_FORMAT_PEG_CONSTRUCTED, params.format);
3571+
assert_equals(true, params.thinking_forced_open);
3572+
assert_equals(false, params.grammar.empty());
3573+
assert_equals(false, params.parser.empty());
3574+
auto grammar = build_grammar(params.grammar);
3575+
GGML_ASSERT(grammar && "Failed to build Step-3.5-Flash grammar");
3576+
}
3577+
}
35563578
}
35573579

35583580
static void test_template_output_peg_parsers() {
@@ -3799,6 +3821,196 @@ static void test_template_output_peg_parsers() {
37993821
});
38003822
}
38013823

3824+
{
3825+
// Step-3.5-Flash (uses Nemotron v3 PEG parser with thinking_forced_open)
3826+
// Unlike Nemotron, Step-3.5-Flash always emits <think> regardless of enable_thinking,
3827+
// so all inputs must include a </think> delimiter.
3828+
auto tmpls = read_templates("models/templates/stepfun-ai-Step-3.5-Flash.jinja");
3829+
3830+
// Test basic message with reasoning
3831+
test_peg_parser(tmpls.get(), [&](auto & t) {
3832+
t.input = "I'm\nthinking\n</think>\nHello, world!\nWhat's up?";
3833+
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
3834+
3835+
t.expect = message_assist_thoughts;
3836+
});
3837+
3838+
// Test basic message without thinking content
3839+
test_peg_parser(tmpls.get(), [&](auto & t) {
3840+
t.input = "</think>\nHello, world!\nWhat's up?";
3841+
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
3842+
3843+
t.expect = message_assist;
3844+
});
3845+
3846+
// Test tool call without thinking content
3847+
test_peg_parser(tmpls.get(), [&](auto & t) {
3848+
t.input =
3849+
"</think>\n"
3850+
"<tool_call>\n"
3851+
"<function=special_function>\n"
3852+
"<parameter=arg1>\n"
3853+
"1\n"
3854+
"</parameter>\n"
3855+
"</function>\n"
3856+
"</tool_call>";
3857+
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
3858+
t.params.tools = {special_function_tool};
3859+
3860+
t.expect = message_assist_call;
3861+
});
3862+
3863+
// Test tool call with thinking
3864+
test_peg_parser(tmpls.get(), [&](auto & t) {
3865+
t.input =
3866+
"I'm\nthinking\n</think>\n"
3867+
"<tool_call>\n"
3868+
"<function=special_function>\n"
3869+
"<parameter=arg1>\n"
3870+
"1\n"
3871+
"</parameter>\n"
3872+
"</function>\n"
3873+
"</tool_call>";
3874+
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
3875+
t.params.tools = {special_function_tool};
3876+
3877+
t.expect = message_assist_call_thoughts;
3878+
});
3879+
3880+
// Test parallel tool calls with thinking
3881+
test_peg_parser(tmpls.get(), [&](auto & t) {
3882+
t.input =
3883+
"I'm\nthinking\n</think>\n"
3884+
"<tool_call>\n"
3885+
"<function=special_function>\n"
3886+
"<parameter=arg1>\n"
3887+
"1\n"
3888+
"</parameter>\n"
3889+
"</function>\n"
3890+
"</tool_call>\n"
3891+
"<tool_call>\n"
3892+
"<function=special_function_with_opt>\n"
3893+
"<parameter=arg1>\n"
3894+
"1\n"
3895+
"</parameter>\n"
3896+
"<parameter=arg2>\n"
3897+
"2\n"
3898+
"</parameter>\n"
3899+
"</function>\n"
3900+
"</tool_call>";
3901+
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
3902+
t.params.parallel_tool_calls = true;
3903+
t.params.tools = {special_function_tool, special_function_tool_with_optional_param};
3904+
3905+
t.expect.reasoning_content = "I'm\nthinking";
3906+
t.expect.tool_calls = {{
3907+
/* .name = */ "special_function",
3908+
/* .arguments = */ R"({"arg1": 1})",
3909+
/* .id = */ {},
3910+
}, {
3911+
/* .name = */ "special_function_with_opt",
3912+
/* .arguments = */ R"({"arg1": 1, "arg2": 2})",
3913+
/* .id = */ {},
3914+
}};
3915+
});
3916+
3917+
// Test parallel tool calls without thinking content
3918+
test_peg_parser(tmpls.get(), [&](auto & t) {
3919+
t.input =
3920+
"</think>\n"
3921+
"<tool_call>\n"
3922+
"<function=special_function>\n"
3923+
"<parameter=arg1>\n"
3924+
"1\n"
3925+
"</parameter>\n"
3926+
"</function>\n"
3927+
"</tool_call>\n"
3928+
"<tool_call>\n"
3929+
"<function=special_function_with_opt>\n"
3930+
"<parameter=arg1>\n"
3931+
"1\n"
3932+
"</parameter>\n"
3933+
"<parameter=arg2>\n"
3934+
"2\n"
3935+
"</parameter>\n"
3936+
"</function>\n"
3937+
"</tool_call>";
3938+
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
3939+
t.params.parallel_tool_calls = true;
3940+
t.params.tools = {special_function_tool, special_function_tool_with_optional_param};
3941+
3942+
t.expect.tool_calls = {{
3943+
/* .name = */ "special_function",
3944+
/* .arguments = */ R"({"arg1": 1})",
3945+
/* .id = */ {},
3946+
}, {
3947+
/* .name = */ "special_function_with_opt",
3948+
/* .arguments = */ R"({"arg1": 1, "arg2": 2})",
3949+
/* .id = */ {},
3950+
}};
3951+
});
3952+
3953+
// Test tool call with code string parameter
3954+
test_peg_parser(tmpls.get(), [&](auto & t) {
3955+
t.input =
3956+
"</think>\n"
3957+
"<tool_call>\n"
3958+
"<function=python>\n"
3959+
"<parameter=code>\n"
3960+
"def hello():\n"
3961+
" print(\"Hello, world!\")\n"
3962+
"\n"
3963+
"hello()\n"
3964+
"</parameter>\n"
3965+
"</function>\n"
3966+
"</tool_call>";
3967+
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
3968+
t.params.tools = {python_tool};
3969+
3970+
t.expect.tool_calls = {{
3971+
/* .name = */ "python",
3972+
/* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
3973+
/* .id = */ {},
3974+
}};
3975+
});
3976+
3977+
// Test tool call with string parameter and no closing </parameter> tag
3978+
test_peg_parser(tmpls.get(), [&](auto & t) {
3979+
t.input =
3980+
"</think>\n"
3981+
"<tool_call>\n"
3982+
"<function=python>\n"
3983+
"<parameter=code>\n"
3984+
"def hello():\n"
3985+
" print(\"Hello, world!\")\n"
3986+
"\n"
3987+
"hello()\n"
3988+
"</function>\n"
3989+
"</tool_call>";
3990+
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
3991+
t.params.tools = {python_tool};
3992+
3993+
t.expect.tool_calls = {{
3994+
/* .name = */ "python",
3995+
/* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
3996+
/* .id = */ {},
3997+
}};
3998+
});
3999+
4000+
// Test response format (JSON schema with thinking)
4001+
test_peg_parser(tmpls.get(), [&](auto & t) {
4002+
t.input =
4003+
"I need to output the invoice details in JSON\n"
4004+
"</think>\n"
4005+
R"({"amount": 123.45, "date": "2025-12-03"})";
4006+
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
4007+
t.params.json_schema = invoice_schema;
4008+
4009+
t.expect.reasoning_content = "I need to output the invoice details in JSON";
4010+
t.expect.content = R"({"amount": 123.45, "date": "2025-12-03"})";
4011+
});
4012+
}
4013+
38024014
{
38034015
// Solar-Open-100B
38044016
auto tmpls = read_templates("models/templates/upstage-Solar-Open-100B.jinja");

0 commit comments

Comments
 (0)