Luce-Org · davide221 · May 22, 2026 · May 21, 2026 · May 21, 2026
diff --git a/dflash/CMakeLists.txt b/dflash/CMakeLists.txt
@@ -264,6 +264,18 @@ add_library(dflash_common STATIC
     src/server/sse_emitter.cpp
     src/server/prefix_cache.cpp
     src/server/disk_prefix_cache.cpp
+    # ── Jinja chat-template engine (from llama.cpp common/jinja/) ──
+    # Used by render_chat_template_jinja() to support --chat-template-file
+    # in dflash_server. Mirrors llama.cpp's common_chat_template plumbing.
+    # unicode.cpp supplies common_parse_utf8_codepoint() used by jinja's
+    # value.cpp tojson() and is otherwise self-contained.
+    deps/llama.cpp/common/jinja/lexer.cpp
+    deps/llama.cpp/common/jinja/parser.cpp
+    deps/llama.cpp/common/jinja/runtime.cpp
+    deps/llama.cpp/common/jinja/value.cpp
+    deps/llama.cpp/common/jinja/string.cpp
+    deps/llama.cpp/common/jinja/caps.cpp
+    deps/llama.cpp/common/unicode.cpp
 )
 # BSA (Block-Sparse Attention) backs the speculative-prefill drafter scoring
 # path. Default ON so prefill is fast out of the box. Turn OFF if you don't
@@ -452,6 +464,10 @@ target_include_directories(dflash_common
     PRIVATE
         ${DFLASH27B_SRC_INCLUDE_DIRS}
         ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/src
+        # Jinja chat-template engine (lexer/parser/runtime/value/string/caps)
+        # pulled from llama.cpp/common/jinja for --chat-template-file support.
+        # nlohmann_json is already linked PUBLIC (used by jinja/value.cpp).
+        ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/common
 )
 if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
     target_include_directories(dflash_common PRIVATE ${CUDAToolkit_INCLUDE_DIRS})

diff --git a/dflash/src/server/chat_template.cpp b/dflash/src/server/chat_template.cpp
@@ -2,6 +2,16 @@
 
 #include "chat_template.h"
 
+#include "jinja/lexer.h"
+#include "jinja/parser.h"
+#include "jinja/runtime.h"
+#include "jinja/value.h"
+
+#include <nlohmann/json.hpp>
+
+#include <memory>
+#include <stdexcept>
+
 namespace dflash::common {
 
 // Qwen3.5 tool preamble — matches the official Jinja template exactly.
@@ -155,4 +165,103 @@ std::string render_chat_template(
     return result;
 }
 
+// ─── Jinja path ─────────────────────────────────────────────────────────
+//
+// Render via a Jinja chat template (e.g. froggeric Qwen3.6 template). Each
+// thread caches the most-recently-parsed program for its template source,
+// so steady-state cost is just the runtime execute (parse happens once per
+// thread per template, and again whenever the template source changes).
+
+namespace {
+
+// Single-entry cache pairing a template's source text with its parsed program.
+struct JinjaCache {
+    std::string                       src;   // template text `prog` was parsed from
+    std::shared_ptr<jinja::program>   prog;  // null until the first successful parse
+};
+
+// thread_local: every thread gets its own independent cache instance.
+static thread_local JinjaCache tls_jinja_cache;
+
+// Return the parsed program for `template_src`.
+//
+// Reuses this thread's cached program when its source text is identical to
+// `template_src`; otherwise lexes and parses the template and replaces the
+// cache entry. The entry is only overwritten after a successful parse, so a
+// failing template never evicts a previously cached program.
+//
+// Throws std::runtime_error prefixed with the failing stage ("jinja lexer: "
+// or "jinja parser: "), in line with the stage-prefixed errors raised by
+// render_chat_template_jinja().
+static std::shared_ptr<jinja::program> get_or_parse(const std::string & template_src) {
+    if (tls_jinja_cache.prog && tls_jinja_cache.src == template_src) {
+        return tls_jinja_cache.prog;
+    }
+    jinja::lexer lex;
+    jinja::lexer_result lex_res;
+    try {
+        lex_res = lex.tokenize(template_src);
+    } catch (const std::exception & e) {
+        throw std::runtime_error(std::string("jinja lexer: ") + e.what());
+    }
+    // Wrap parser failures the same way as lexer failures so callers always
+    // see a std::runtime_error that names the stage that rejected the template.
+    std::shared_ptr<jinja::program> prog;
+    try {
+        prog = std::make_shared<jinja::program>(jinja::parse_from_tokens(lex_res));
+    } catch (const std::exception & e) {
+        throw std::runtime_error(std::string("jinja parser: ") + e.what());
+    }
+    tls_jinja_cache.src  = template_src;
+    tls_jinja_cache.prog = prog;
+    return prog;
+}
+
+}  // namespace
+
+std::string render_chat_template_jinja(
+    const std::string & template_src,
+    const std::vector<ChatMessage> & messages,
+    const std::string & bos_token,
+    const std::string & eos_token,
+    bool add_generation_prompt,
+    bool enable_thinking,
+    const std::string & tools_json)
+{
+    using json = nlohmann::ordered_json;
+
+    // Builds the error thrown for a failing stage: "<stage><e.what()>".
+    const auto tagged = [](const char * stage, const std::exception & e) {
+        return std::runtime_error(std::string(stage) + e.what());
+    };
+
+    if (template_src.empty()) {
+        throw std::runtime_error("render_chat_template_jinja: template_src is empty");
+    }
+
+    // Fetch the program (cached or freshly parsed) before any input is
+    // converted, so template errors are reported ahead of input errors.
+    const std::shared_ptr<jinja::program> program = get_or_parse(template_src);
+
+    // One JSON object per chat message; `tool_call_id` is only emitted when
+    // the message carries one.
+    json chat = json::array();
+    for (const auto & msg : messages) {
+        json entry;
+        entry["role"]    = msg.role;
+        entry["content"] = msg.content;
+        if (!msg.tool_call_id.empty()) {
+            entry["tool_call_id"] = msg.tool_call_id;
+        }
+        chat.push_back(std::move(entry));
+    }
+
+    // Template globals, mirroring llama.cpp's
+    // common_chat_template_direct_apply_impl. The keys must be spelled the
+    // way the Jinja templates look them up.
+    json globals;
+    globals["messages"]              = std::move(chat);
+    globals["bos_token"]             = bos_token;
+    globals["eos_token"]             = eos_token;
+    globals["add_generation_prompt"] = add_generation_prompt;
+    globals["enable_thinking"]       = enable_thinking;
+
+    // Empty text, "[]" and "null" all mean "no tools": `tools` stays unset.
+    const bool no_tools = tools_json.empty() || tools_json == "[]" || tools_json == "null";
+    if (!no_tools) {
+        try {
+            globals["tools"] = json::parse(tools_json);
+        } catch (const std::exception & e) {
+            throw tagged("render_chat_template_jinja: failed to parse tools JSON: ", e);
+        }
+    }
+
+    jinja::context ctx(template_src);
+    try {
+        jinja::global_from_json(ctx, globals, /*mark_input=*/false);
+    } catch (const std::exception & e) {
+        throw tagged("jinja global_from_json: ", e);
+    }
+
+    // Run the program and flatten its output into the final prompt string.
+    try {
+        jinja::runtime rt(ctx);
+        jinja::value results = rt.execute(*program);
+        return jinja::runtime::gather_string_parts(results)->as_string().str();
+    } catch (const std::exception & e) {
+        throw tagged("jinja runtime: ", e);
+    }
+}
+
 }  // namespace dflash::common
diff --git a/dflash/src/server/chat_template.h b/dflash/src/server/chat_template.h
@@ -49,4 +49,30 @@ std::string render_chat_template(
 // Detect the appropriate chat format for an architecture.
 ChatFormat chat_format_for_arch(const std::string & arch);
 
+// Render chat messages via a Jinja chat template (e.g. froggeric Qwen3.6
+// template, or any of the llama.cpp models/templates/*.jinja files).
+// Mirrors llama.cpp's common_chat_template_direct_apply: parses the template
+// once per thread, converts inputs to jinja values, runs the program, returns
+// the rendered prompt string.
+//
+// `template_src`  non-empty literal Jinja source (from --chat-template-file).
+// `messages`      each is exposed to the template as an object with `role`,
+//                 `content` and, when non-empty, `tool_call_id`.
+// `bos_token`, `eos_token`  passed through to the template (Qwen3.6 templates
+//                 may use {{bos_token}} / {{eos_token}}). Empty if unknown.
+// `add_generation_prompt`, `enable_thinking`  exposed under the same names.
+// `tools_json`    optional JSON array of tool definitions; parsed and injected
+//                 as `tools` unless it is empty, "[]" or "null".
+// Caches the most recently parsed program per thread. Throws
+// std::runtime_error on empty template_src, malformed tools_json, or
+// lexer/parser/runtime failure (caller should surface a 500 response).
+std::string render_chat_template_jinja(
+    const std::string & template_src,
+    const std::vector<ChatMessage> & messages,
+    const std::string & bos_token,
+    const std::string & eos_token,
+    bool add_generation_prompt = true,
+    bool enable_thinking = false,
+    const std::string & tools_json = "");
+
 }  // namespace dflash::common
diff --git a/dflash/src/server/http_server.cpp b/dflash/src/server/http_server.cpp
@@ -439,9 +439,41 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
             tools_json = req.tools.dump();
         }
 
-        std::string rendered = render_chat_template(chat_msgs, chat_format_,
-                                                    true, enable_thinking,
-                                                    tools_json);
+        std::string rendered;
+        if (!config_.chat_template_src.empty()) {
+            // Jinja path: caller supplied a chat template file via
+            // --chat-template-file. Override the hardcoded QWEN3/LAGUNA
+            // renderer. Used for tool-using agents that need the Anthropic
+            // tool_use envelope (e.g. froggeric Qwen3.6 template).
+            //
+            // Special tokens like <|im_start|> / <|im_end|> are stored
+            // verbatim in the GGUF vocab — use raw_token() to skip the
+            // GPT-2 byte decode (otherwise <0xC4><0x91> nonsense appears).
+            const std::string & bos_str = (tokenizer_.bos_id() >= 0)
+                ? tokenizer_.raw_token(tokenizer_.bos_id())
+                : std::string();
+            const std::string & eos_str = (tokenizer_.eos_id() >= 0)
+                ? tokenizer_.raw_token(tokenizer_.eos_id())
+                : std::string();
+            try {
+                rendered = render_chat_template_jinja(
+                    config_.chat_template_src,
+                    chat_msgs,
+                    bos_str,
+                    eos_str,
+                    /*add_generation_prompt=*/true,
+                    enable_thinking,
+                    tools_json);
+            } catch (const std::exception & e) {
+                send_error(fd, 500,
+                    std::string("chat template (jinja) render failed: ") + e.what());
+                return true;
+            }
+        } else {
+            rendered = render_chat_template(chat_msgs, chat_format_,
+                                            true, enable_thinking,
+                                            tools_json);
+        }
         req.prompt_tokens = tokenizer_.encode(rendered);
 
         // Detect if prompt ends with <think> (model will start in reasoning mode).

diff --git a/dflash/src/server/http_server.h b/dflash/src/server/http_server.h
@@ -64,6 +64,12 @@ struct ServerConfig {
     int         disk_cache_min_tokens = 512; // only persist >= this many tokens
     int         disk_cache_continued_interval = 10240; // continued checkpoint every N tokens
     int         disk_cache_cold_max_tokens = 10240;    // cold prefix for prompts longer than this
+
+    // Optional Jinja chat template (overrides the hardcoded ChatFormat::QWEN3
+    // / LAGUNA renderer when non-empty). Used for tool-using agents that need
+    // the Anthropic tool_use envelope, e.g. froggeric Qwen3.6 template.
+    std::string chat_template_src;          // literal Jinja source (loaded from file)
+    std::string chat_template_path;         // path it was loaded from (logged at startup)
 };
 
 // ─── Parsed request ─────────────────────────────────────────────────────

diff --git a/dflash/src/server/server_main.cpp b/dflash/src/server/server_main.cpp
@@ -76,6 +76,13 @@ static void print_usage(const char * prog) {
         "  --kv-cache-min-tokens <N>   Min tokens to persist (default: 512)\n"
         "  --kv-cache-interval <N>     Continued checkpoint every N tokens (default: 10240)\n"
         "  --kv-cache-cold-max <N>     Cold prefix for prompts longer than N tokens (default: 10240)\n"
+        "\n"
+        "Chat template (optional, e.g. froggeric Qwen3.6 template for tool-using\n"
+        "agents that need the Anthropic tool_use envelope):\n"
+        "  --chat-template-file <path>  Load a Jinja chat template file.\n"
+        "                               Overrides the hardcoded Qwen3/Laguna\n"
+        "                               renderer. Empty or missing falls back\n"
+        "                               to the hardcoded template.\n"
         "\n", prog);
 }
 
@@ -143,6 +150,36 @@ int main(int argc, char ** argv) {
             sconfig.pflash_skip_park = true;
         } else if (std::strcmp(argv[i], "--lazy-draft") == 0) {
             sconfig.lazy_draft = true;
+        } else if (std::strcmp(argv[i], "--chat-template-file") == 0 && i + 1 < argc) {
+            const char * path = argv[++i];
+            std::FILE * f = std::fopen(path, "rb");
+            if (!f) {
+                std::fprintf(stderr, "[server] --chat-template-file: cannot open '%s'\n", path);
+                return 1;
+            }
+            std::fseek(f, 0, SEEK_END);
+            long n = std::ftell(f);
+            std::fseek(f, 0, SEEK_SET);
+            if (n <= 0) {
+                // The usage text promises "Empty or missing falls back to the
+                // hardcoded template." Honor that: log a warning and leave
+                // chat_template_src empty so http_server.cpp falls through to
+                // the hardcoded QWEN3/LAGUNA renderer, instead of aborting
+                // startup.
+                std::fclose(f);
+                std::fprintf(stderr, "[server] --chat-template-file: '%s' is empty, "
+                                     "falling back to hardcoded template\n", path);
+            } else {
+                sconfig.chat_template_src.resize((size_t)n);
+                size_t got = std::fread(sconfig.chat_template_src.data(), 1, (size_t)n, f);
+                std::fclose(f);
+                if (got != (size_t)n) {
+                    std::fprintf(stderr, "[server] --chat-template-file: short read on '%s'\n", path);
+                    return 1;
+                }
+                sconfig.chat_template_path = path;
+                std::fprintf(stderr, "[server] loaded chat template from %s (%ld bytes)\n", path, n);
+            }
         } else if (std::strcmp(argv[i], "--kv-cache-dir") == 0 && i + 1 < argc) {
             sconfig.disk_cache_dir = argv[++i];
         } else if (std::strcmp(argv[i], "--kv-cache-budget") == 0 && i + 1 < argc) {