diff --git a/dflash/CMakeLists.txt b/dflash/CMakeLists.txt index de0b5f2d..9be56683 100644 --- a/dflash/CMakeLists.txt +++ b/dflash/CMakeLists.txt @@ -264,6 +264,18 @@ add_library(dflash_common STATIC src/server/sse_emitter.cpp src/server/prefix_cache.cpp src/server/disk_prefix_cache.cpp + # ── Jinja chat-template engine (from llama.cpp common/jinja/) ── + # Used by render_chat_template_jinja() to support --chat-template-file + # in dflash_server. Mirrors llama.cpp's common_chat_template plumbing. + # unicode.cpp supplies common_parse_utf8_codepoint() used by jinja's + # value.cpp tojson() and is otherwise self-contained. + deps/llama.cpp/common/jinja/lexer.cpp + deps/llama.cpp/common/jinja/parser.cpp + deps/llama.cpp/common/jinja/runtime.cpp + deps/llama.cpp/common/jinja/value.cpp + deps/llama.cpp/common/jinja/string.cpp + deps/llama.cpp/common/jinja/caps.cpp + deps/llama.cpp/common/unicode.cpp ) # BSA (Block-Sparse Attention) backs the speculative-prefill drafter scoring # path. Default ON so prefill is fast out of the box. Turn OFF if you don't @@ -452,6 +464,10 @@ target_include_directories(dflash_common PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/src + # Jinja chat-template engine (lexer/parser/runtime/value/string/caps) + # pulled from llama.cpp/common/jinja for --chat-template-file support. + # nlohmann_json is already linked PUBLIC (used by jinja/value.cpp). 
+    ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/common
 )
 if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
     target_include_directories(dflash_common PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
diff --git a/dflash/src/server/chat_template.cpp b/dflash/src/server/chat_template.cpp
index 92c46588..6086feae 100644
--- a/dflash/src/server/chat_template.cpp
+++ b/dflash/src/server/chat_template.cpp
@@ -2,6 +2,16 @@
 
 #include "chat_template.h"
 
+#include "jinja/lexer.h"
+#include "jinja/parser.h"
+#include "jinja/runtime.h"
+#include "jinja/value.h"
+
+#include
+
+#include
+#include
+
 namespace dflash::common {
 
 // Qwen3.5 tool preamble — matches the official Jinja template exactly.
@@ -155,4 +165,128 @@ std::string render_chat_template(
     return result;
 }
 
+// ─── Jinja path ─────────────────────────────────────────────────────────
+//
+// Render via a Jinja chat template (e.g. froggeric Qwen3.6 template). The
+// parsed program is cached per thread (thread_local) and only for the most
+// recently used template source, so steady-state cost is just the runtime
+// execute: a thread re-parses only when it is handed a template source
+// different from the one it parsed last.
+
+namespace {
+
+// Single-entry cache: the template source last parsed on this thread and
+// the program produced from it.
+struct JinjaCache {
+    std::string src;
+    std::shared_ptr prog;
+};
+
+static thread_local JinjaCache tls_jinja_cache;
+
+// Return the parsed program for `template_src`, reusing this thread's cached
+// program when the source text is unchanged. A std::exception raised while
+// lexing or parsing is rethrown as std::runtime_error prefixed with the
+// failing stage; the cache is only updated after both stages succeed.
+static std::shared_ptr get_or_parse(const std::string & template_src) {
+    if (tls_jinja_cache.prog && tls_jinja_cache.src == template_src) {
+        return tls_jinja_cache.prog;
+    }
+    jinja::lexer lex;
+    jinja::lexer_result lex_res;
+    try {
+        lex_res = lex.tokenize(template_src);
+    } catch (const std::exception & e) {
+        throw std::runtime_error(std::string("jinja lexer: ") + e.what());
+    }
+    // Same wrapping for the parse step, so every template error reaches the
+    // caller as std::runtime_error naming the stage that failed.
+    decltype(tls_jinja_cache.prog) prog;
+    try {
+        prog = std::make_shared(jinja::parse_from_tokens(lex_res));
+    } catch (const std::exception & e) {
+        throw std::runtime_error(std::string("jinja parser: ") + e.what());
+    }
+    tls_jinja_cache.src = template_src;
+    tls_jinja_cache.prog = prog;
+    return prog;
+}
+
+} // namespace
+
+// Render `messages` through the Jinja template `template_src` and return the
+// resulting prompt text. Throws std::runtime_error when the template is empty,
+// when `tools_json` is not valid JSON, or when lexing, parsing, context setup
+// or execution raises a std::exception.
+std::string render_chat_template_jinja(
+    const std::string & template_src,
+    const std::vector & messages,
+    const std::string & bos_token,
+    const std::string & eos_token,
+    bool add_generation_prompt,
+    bool enable_thinking,
+    const std::string & tools_json)
+{
+    if (template_src.empty()) {
+        throw std::runtime_error("render_chat_template_jinja: template_src is empty");
+    }
+
+    auto prog = get_or_parse(template_src);
+
+    // Build the JSON input that mirrors llama.cpp's
+    // common_chat_template_direct_apply_impl. Field names must match the
+    // names the Jinja templates expect (messages, tools, bos_token,
+    // eos_token, add_generation_prompt, enable_thinking).
+    nlohmann::ordered_json messages_j = nlohmann::ordered_json::array();
+    for (const auto & m : messages) {
+        nlohmann::ordered_json mj;
+        mj["role"] = m.role;
+        mj["content"] = m.content;
+        if (!m.tool_call_id.empty()) {
+            mj["tool_call_id"] = m.tool_call_id;
+        }
+        messages_j.push_back(std::move(mj));
+    }
+
+    nlohmann::ordered_json inputs;
+    inputs["messages"] = std::move(messages_j);
+    inputs["bos_token"] = bos_token;
+    inputs["eos_token"] = eos_token;
+    inputs["add_generation_prompt"] = add_generation_prompt;
+    inputs["enable_thinking"] = enable_thinking;
+
+    // `tools` is left undefined when there is nothing to pass. The check is
+    // made on the parsed value rather than the raw text, so whitespace
+    // variants such as "[ ]" are skipped just like "[]" and "null".
+    if (!tools_json.empty()) {
+        nlohmann::ordered_json tools_j;
+        try {
+            tools_j = nlohmann::ordered_json::parse(tools_json);
+        } catch (const std::exception & e) {
+            throw std::runtime_error(
+                std::string("render_chat_template_jinja: failed to parse tools JSON: ") + e.what());
+        }
+        const bool nothing_to_pass = tools_j.is_null() || (tools_j.is_array() && tools_j.empty());
+        if (!nothing_to_pass) {
+            inputs["tools"] = std::move(tools_j);
+        }
+    }
+
+    jinja::context ctx(template_src);
+    try {
+        jinja::global_from_json(ctx, inputs, /*mark_input=*/false);
+    } catch (const std::exception & e) {
+        throw std::runtime_error(std::string("jinja global_from_json: ") + e.what());
+    }
+
+    try {
+        jinja::runtime rt(ctx);
+        jinja::value results = rt.execute(*prog);
+        auto parts = jinja::runtime::gather_string_parts(results);
+        return parts->as_string().str();
+    } catch (const std::exception & e) {
+        throw std::runtime_error(std::string("jinja runtime: ") + e.what());
+    }
+}
+
 } // namespace dflash::common
diff --git a/dflash/src/server/chat_template.h b/dflash/src/server/chat_template.h
index 5f35f492..c51d7ef1 100644
--- a/dflash/src/server/chat_template.h
+++ b/dflash/src/server/chat_template.h
@@ -49,4 +49,30 @@ std::string render_chat_template(
 // Detect the appropriate chat format for an architecture.
 ChatFormat chat_format_for_arch(const std::string & arch);
 
+// Render chat messages via a Jinja chat template (e.g. froggeric Qwen3.6
+// template, or any of the llama.cpp models/templates/*.jinja files).
+//
+// Mirrors llama.cpp's common_chat_template_direct_apply: parses the template
+// once per thread, converts inputs to jinja values, runs the program, returns
+// the rendered prompt string.
+//
+//   `template_src`  literal Jinja source (read from --chat-template-file)
+//   `bos_token`,
+//   `eos_token`     passed through to the template (Qwen3.6 templates may use
+//                   {{bos_token}} / {{eos_token}}). Use empty strings if unknown.
+//   `tools_json`    optional JSON array of tool definitions; when non-empty it
+//                   is parsed and injected as `tools` into the template context.
+//
+// Internally caches the most recently parsed program per thread (avoids
+// re-parsing the template on every request). Throws std::runtime_error on
+// lexer/parser/runtime failure (caller should surface a 500 response).
+std::string render_chat_template_jinja(
+    const std::string & template_src,
+    const std::vector & messages,
+    const std::string & bos_token,
+    const std::string & eos_token,
+    bool add_generation_prompt = true,
+    bool enable_thinking = false,
+    const std::string & tools_json = "");
+
 } // namespace dflash::common
diff --git a/dflash/src/server/http_server.cpp b/dflash/src/server/http_server.cpp
index 8188fd07..1fb4b0ad 100644
--- a/dflash/src/server/http_server.cpp
+++ b/dflash/src/server/http_server.cpp
@@ -439,9 +439,43 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
             tools_json = req.tools.dump();
         }
 
-        std::string rendered = render_chat_template(chat_msgs, chat_format_,
-                                                     true, enable_thinking,
-                                                     tools_json);
+        std::string rendered;
+        if (config_.chat_template_src.empty()) {
+            // No template was loaded via --chat-template-file: use the
+            // hardcoded QWEN3/LAGUNA renderer.
+            rendered = render_chat_template(chat_msgs, chat_format_,
+                                            true, enable_thinking,
+                                            tools_json);
+        } else {
+            // Jinja path: a template supplied via --chat-template-file
+            // replaces the hardcoded renderer. Used for tool-using agents
+            // that need the Anthropic tool_use envelope (e.g. froggeric
+            // Qwen3.6 template).
+            //
+            // Special tokens like <|im_start|> / <|im_end|> are stored
+            // verbatim in the GGUF vocab — raw_token() skips the GPT-2 byte
+            // decode (otherwise <0xC4><0x91> nonsense appears). A negative
+            // bos/eos id is passed to the template as an empty string.
+            const auto bos_id = tokenizer_.bos_id();
+            const std::string bos_str =
+                (bos_id >= 0) ? tokenizer_.raw_token(bos_id) : std::string();
+            const auto eos_id = tokenizer_.eos_id();
+            const std::string eos_str =
+                (eos_id >= 0) ? tokenizer_.raw_token(eos_id) : std::string();
+            try {
+                rendered = render_chat_template_jinja(
+                    config_.chat_template_src,
+                    chat_msgs,
+                    bos_str,
+                    eos_str,
+                    /*add_generation_prompt=*/true,
+                    enable_thinking,
+                    tools_json);
+            } catch (const std::exception & e) {
+                send_error(fd, 500,
+                           std::string("chat template (jinja) render failed: ") + e.what());
+                return true;
+            }
+        }
         req.prompt_tokens = tokenizer_.encode(rendered);
 
         // Detect if prompt ends with (model will start in reasoning mode).
diff --git a/dflash/src/server/http_server.h b/dflash/src/server/http_server.h
index 24d075d8..45d73bf5 100644
--- a/dflash/src/server/http_server.h
+++ b/dflash/src/server/http_server.h
@@ -64,6 +64,12 @@ struct ServerConfig {
     int disk_cache_min_tokens = 512; // only persist >= this many tokens
     int disk_cache_continued_interval = 10240; // continued checkpoint every N tokens
     int disk_cache_cold_max_tokens = 10240; // cold prefix for prompts longer than this
+
+    // Optional Jinja chat template (overrides the hardcoded ChatFormat::QWEN3
+    // / LAGUNA renderer when non-empty). Used for tool-using agents that need
+    // the Anthropic tool_use envelope, e.g. froggeric Qwen3.6 template.
+    std::string chat_template_src;  // literal Jinja source (loaded from file)
+    std::string chat_template_path; // path it was loaded from (logged at startup)
 };
 
 // ─── Parsed request ─────────────────────────────────────────────────────
diff --git a/dflash/src/server/server_main.cpp b/dflash/src/server/server_main.cpp
index 627e4c64..c8857934 100644
--- a/dflash/src/server/server_main.cpp
+++ b/dflash/src/server/server_main.cpp
@@ -76,6 +76,13 @@ static void print_usage(const char * prog) {
         " --kv-cache-min-tokens Min tokens to persist (default: 512)\n"
         " --kv-cache-interval Continued checkpoint every N tokens (default: 10240)\n"
         " --kv-cache-cold-max Cold prefix for prompts longer than N tokens (default: 10240)\n"
+        "\n"
+        "Chat template (optional, e.g. froggeric Qwen3.6 template for tool-using\n"
+        "agents that need the Anthropic tool_use envelope):\n"
+        " --chat-template-file Load a Jinja chat template file.\n"
+        " Overrides the hardcoded Qwen3/Laguna\n"
+        " renderer. An empty file falls back\n"
+        " to the hardcoded template.\n"
         "\n",
         prog);
 }
@@ -143,6 +150,46 @@ int main(int argc, char ** argv) {
             sconfig.pflash_skip_park = true;
         } else if (std::strcmp(argv[i], "--lazy-draft") == 0) {
             sconfig.lazy_draft = true;
+        } else if (std::strcmp(argv[i], "--chat-template-file") == 0 && i + 1 < argc) {
+            // Read the whole template file into sconfig.chat_template_src.
+            const char * path = argv[++i];
+            std::FILE * f = std::fopen(path, "rb");
+            if (!f) {
+                std::fprintf(stderr, "[server] --chat-template-file: cannot open '%s'\n", path);
+                return 1;
+            }
+            // Measure the file by seeking to its end. A failed fseek/ftell is a
+            // startup error of its own: ftell() reports failure as -1, which
+            // must not be mistaken for an empty file.
+            long n = -1;
+            if (std::fseek(f, 0, SEEK_END) == 0) {
+                n = std::ftell(f);
+            }
+            if (n < 0 || std::fseek(f, 0, SEEK_SET) != 0) {
+                std::fclose(f);
+                std::fprintf(stderr, "[server] --chat-template-file: cannot determine size of '%s'\n", path);
+                return 1;
+            }
+            if (n == 0) {
+                // The usage text promises that an empty file falls back to the
+                // hardcoded template. Honor that: log a warning and leave
+                // chat_template_src empty so http_server.cpp falls through to
+                // the hardcoded QWEN3/LAGUNA renderer, instead of aborting
+                // startup.
+                std::fclose(f);
+                std::fprintf(stderr, "[server] --chat-template-file: '%s' is empty, "
+                                     "falling back to hardcoded template\n", path);
+            } else {
+                sconfig.chat_template_src.resize((size_t)n);
+                const size_t got = std::fread(sconfig.chat_template_src.data(), 1, (size_t)n, f);
+                std::fclose(f);
+                if (got != (size_t)n) {
+                    std::fprintf(stderr, "[server] --chat-template-file: short read on '%s'\n", path);
+                    return 1;
+                }
+                sconfig.chat_template_path = path;
+                std::fprintf(stderr, "[server] loaded chat template from %s (%ld bytes)\n", path, n);
+            }
         } else if (std::strcmp(argv[i], "--kv-cache-dir") == 0 && i + 1 < argc) {
             sconfig.disk_cache_dir = argv[++i];
         } else if (std::strcmp(argv[i], "--kv-cache-budget") == 0 && i + 1 < argc) {
diff --git a/dflash/test/test_server_unit.cpp b/dflash/test/test_server_unit.cpp
index 848b7539..fa05346a 100644
--- a/dflash/test/test_server_unit.cpp
+++ b/dflash/test/test_server_unit.cpp
@@ -15,6 +15,7 @@
 #include "server/utf8_utils.h"
 #include "server/api_types.h"
 #include "server/http_server.h"
+#include "server/chat_template.h"
 
 #include
 #include
@@ -576,6 +577,101 @@ static void test_pflash_threshold_always_mode() {
     TEST_ASSERT(should);
 }
 
+// ═══════════════════════════════════════════════════════════════════════
+// Jinja chat template
+// ═══════════════════════════════════════════════════════════════════════
+
+// Minimal Jinja template: just join roles + contents. Used to verify the
+// runtime + global_from_json plumbing without depending on any external
+// .jinja file at test time.
+static const char MINI_JINJA_TEMPLATE[] =
+    "{%- for m in messages -%}"
+    "<|{{ m.role }}|>{{ m.content }}\n"
+    "{%- endfor -%}"
+    "{%- if add_generation_prompt -%}"
+    "<|assistant|>"
+    "{%- endif -%}";
+
+static void test_jinja_render_basic() {
+    std::vector msgs = {
+        {"system", "you are helpful", ""},
+        {"user", "hi", ""},
+    };
+    std::string out = render_chat_template_jinja(
+        MINI_JINJA_TEMPLATE, msgs,
+        /*bos=*/"", /*eos=*/"",
+        /*add_gen=*/true, /*think=*/false,
+        /*tools=*/"");
+    TEST_ASSERT(out.find("<|system|>you are helpful") != std::string::npos);
+    TEST_ASSERT(out.find("<|user|>hi") != std::string::npos);
+    TEST_ASSERT(out.find("<|assistant|>") != std::string::npos);
+}
+
+static void test_jinja_render_no_gen_prompt() {
+    std::vector msgs = {{"user", "ping", ""}};
+    std::string out = render_chat_template_jinja(
+        MINI_JINJA_TEMPLATE, msgs, "", "",
+        /*add_gen=*/false, /*think=*/false, "");
+    TEST_ASSERT(out.find("<|user|>ping") != std::string::npos);
+    TEST_ASSERT(out.find("<|assistant|>") == std::string::npos);
+}
+
+static void test_jinja_render_tools_injected() {
+    // Template references `tools` to confirm it was passed in.
+    static const char TPL[] =
+        "{%- if tools -%}TOOLS_PRESENT:{{ tools[0].name }}{%- endif -%}"
+        "{%- for m in messages -%}<|{{ m.role }}|>{{ m.content }}{%- endfor -%}";
+    std::vector msgs = {{"user", "?", ""}};
+    std::string tools = R"([{"name":"my_tool","description":"test"}])";
+    std::string out = render_chat_template_jinja(
+        TPL, msgs, "", "", false, false, tools);
+    TEST_ASSERT(out.find("TOOLS_PRESENT:my_tool") != std::string::npos);
+}
+
+static void test_jinja_render_empty_tools_skipped() {
+    // tools_json == "[]" must NOT define `tools` in the template context.
+    static const char TPL[] =
+        "{%- if tools -%}TOOLS_PRESENT{%- else -%}NO_TOOLS{%- endif -%}";
+    std::vector msgs = {{"user", "?", ""}};
+    std::string out = render_chat_template_jinja(
+        TPL, msgs, "", "", false, false, "[]");
+    TEST_ASSERT(out.find("NO_TOOLS") != std::string::npos);
+    TEST_ASSERT(out.find("TOOLS_PRESENT") == std::string::npos);
+}
+
+static void test_jinja_render_bos_eos_threaded() {
+    // Non-empty markers: with empty bos/eos a dropped token would go unnoticed.
+    static const char TPL[] = "{{ bos_token }}HI{{ eos_token }}";
+    std::vector msgs;
+    std::string out = render_chat_template_jinja(
+        TPL, msgs, "[BOS]", "[EOS]", false, false, "");
+    TEST_ASSERT(out == "[BOS]HI[EOS]");
+}
+
+static void test_jinja_render_empty_template_throws() {
+    std::vector msgs = {{"user", "x", ""}};
+    bool threw = false;
+    try {
+        (void)render_chat_template_jinja("", msgs, "", "", true, false, "");
+    } catch (const std::runtime_error &) {
+        threw = true;
+    }
+    TEST_ASSERT(threw);
+}
+
+static void test_jinja_render_bad_tools_json_throws() {
+    static const char TPL[] = "{%- for m in messages -%}{{ m.role }}{%- endfor -%}";
+    std::vector msgs = {{"user", "x", ""}};
+    bool threw = false;
+    try {
+        (void)render_chat_template_jinja(
+            TPL, msgs, "", "", true, false, "{not valid json");
+    } catch (const std::runtime_error &) {
+        threw = true;
+    }
+    TEST_ASSERT(threw);
+}
+
 // ═══════════════════════════════════════════════════════════════════════
 // Disk Prefix Cache Tests
 // ═══════════════════════════════════════════════════════════════════════
@@ -972,6 +1068,15 @@ int main() {
     RUN_TEST(test_pflash_threshold_auto_mode);
     RUN_TEST(test_pflash_threshold_always_mode);
 
+    std::fprintf(stderr, "\n── Jinja chat template ──\n");
+    RUN_TEST(test_jinja_render_basic);
+    RUN_TEST(test_jinja_render_no_gen_prompt);
+    RUN_TEST(test_jinja_render_tools_injected);
+    RUN_TEST(test_jinja_render_empty_tools_skipped);
+    RUN_TEST(test_jinja_render_bos_eos_threaded);
+    RUN_TEST(test_jinja_render_empty_template_throws);
+    RUN_TEST(test_jinja_render_bad_tools_json_throws);
+
     std::fprintf(stderr, "\n── Disk prefix cache ──\n");
     RUN_TEST(test_disk_cache_config_defaults);
     RUN_TEST(test_disk_cache_disabled_when_no_dir);