Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions dflash/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,18 @@ add_library(dflash_common STATIC
src/server/sse_emitter.cpp
src/server/prefix_cache.cpp
src/server/disk_prefix_cache.cpp
# ── Jinja chat-template engine (from llama.cpp common/jinja/) ──
# Used by render_chat_template_jinja() to support --chat-template-file
# in dflash_server. Mirrors llama.cpp's common_chat_template plumbing.
# unicode.cpp supplies common_parse_utf8_codepoint() used by jinja's
# value.cpp tojson() and is otherwise self-contained.
deps/llama.cpp/common/jinja/lexer.cpp
deps/llama.cpp/common/jinja/parser.cpp
deps/llama.cpp/common/jinja/runtime.cpp
deps/llama.cpp/common/jinja/value.cpp
deps/llama.cpp/common/jinja/string.cpp
deps/llama.cpp/common/jinja/caps.cpp
deps/llama.cpp/common/unicode.cpp
)
# BSA (Block-Sparse Attention) backs the speculative-prefill drafter scoring
# path. Default ON so prefill is fast out of the box. Turn OFF if you don't
Expand Down Expand Up @@ -452,6 +464,10 @@ target_include_directories(dflash_common
PRIVATE
${DFLASH27B_SRC_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/src
# Jinja chat-template engine (lexer/parser/runtime/value/string/caps)
# pulled from llama.cpp/common/jinja for --chat-template-file support.
# nlohmann_json is already linked PUBLIC (used by jinja/value.cpp).
${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/common
)
if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
target_include_directories(dflash_common PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
Expand Down
109 changes: 109 additions & 0 deletions dflash/src/server/chat_template.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@

#include "chat_template.h"

#include "jinja/lexer.h"
#include "jinja/parser.h"
#include "jinja/runtime.h"
#include "jinja/value.h"

#include <nlohmann/json.hpp>

#include <memory>
#include <stdexcept>

namespace dflash::common {

// Qwen3.5 tool preamble — matches the official Jinja template exactly.
Expand Down Expand Up @@ -155,4 +165,103 @@ std::string render_chat_template(
return result;
}

// ─── Jinja path ─────────────────────────────────────────────────────────
//
// Render via a Jinja chat template (e.g. froggeric Qwen3.6 template). Each
// thread caches the most recently parsed program together with its template
// source, so steady-state cost is just the runtime execute (parse happens
// once per thread for as long as the template source does not change).

namespace {

// Single-entry cache of the most recently parsed template.
//   src   exact template text that `prog` was parsed from
//   prog  parsed program; null until the first successful parse
struct JinjaCache {
    std::string src;
    std::shared_ptr<jinja::program> prog;
};

// One cache per thread (thread_local), so lookups need no locking.
static thread_local JinjaCache tls_jinja_cache;

// Return the parsed program for `template_src`.
//
// Reuses this thread's cached program when `template_src` is byte-identical
// to the source that was parsed last on this thread; otherwise lexes and
// parses the template and replaces the cache entry.
//
// Throws std::runtime_error prefixed with "jinja lexer: " or "jinja parser: "
// depending on which stage failed, so the message surfaced to the client
// identifies the stage (matching the "jinja runtime: " prefix used by the
// caller). The cache is only updated after both stages succeeded, so a
// failed parse leaves the previously cached program untouched.
static std::shared_ptr<jinja::program> get_or_parse(const std::string & template_src) {
    if (tls_jinja_cache.prog && tls_jinja_cache.src == template_src) {
        return tls_jinja_cache.prog;
    }

    jinja::lexer lex;
    jinja::lexer_result lex_res;
    try {
        lex_res = lex.tokenize(template_src);
    } catch (const std::exception & e) {
        throw std::runtime_error(std::string("jinja lexer: ") + e.what());
    }

    // Parse errors are relabeled the same way as lexer errors; previously
    // they escaped without any stage prefix.
    std::shared_ptr<jinja::program> prog;
    try {
        prog = std::make_shared<jinja::program>(jinja::parse_from_tokens(lex_res));
    } catch (const std::exception & e) {
        throw std::runtime_error(std::string("jinja parser: ") + e.what());
    }

    tls_jinja_cache.src  = template_src;
    tls_jinja_cache.prog = prog;
    return prog;
}

} // namespace

std::string render_chat_template_jinja(
const std::string & template_src,
const std::vector<ChatMessage> & messages,
const std::string & bos_token,
const std::string & eos_token,
bool add_generation_prompt,
bool enable_thinking,
const std::string & tools_json)
{
if (template_src.empty()) {
throw std::runtime_error("render_chat_template_jinja: template_src is empty");
}

auto prog = get_or_parse(template_src);

// Build the JSON input that mirrors llama.cpp's
// common_chat_template_direct_apply_impl. Field names must match the
// names the Jinja templates expect (messages, tools, bos_token,
// eos_token, add_generation_prompt, enable_thinking).
nlohmann::ordered_json messages_j = nlohmann::ordered_json::array();
for (const auto & m : messages) {
nlohmann::ordered_json mj;
mj["role"] = m.role;
mj["content"] = m.content;
if (!m.tool_call_id.empty()) {
mj["tool_call_id"] = m.tool_call_id;
}
messages_j.push_back(std::move(mj));
}

nlohmann::ordered_json inputs;
inputs["messages"] = std::move(messages_j);
inputs["bos_token"] = bos_token;
inputs["eos_token"] = eos_token;
inputs["add_generation_prompt"] = add_generation_prompt;
inputs["enable_thinking"] = enable_thinking;

bool has_tools = !tools_json.empty() && tools_json != "[]" && tools_json != "null";
if (has_tools) {
try {
inputs["tools"] = nlohmann::ordered_json::parse(tools_json);
} catch (const std::exception & e) {
throw std::runtime_error(
std::string("render_chat_template_jinja: failed to parse tools JSON: ") + e.what());
}
}

jinja::context ctx(template_src);
try {
jinja::global_from_json(ctx, inputs, /*mark_input=*/false);
} catch (const std::exception & e) {
throw std::runtime_error(std::string("jinja global_from_json: ") + e.what());
}

try {
jinja::runtime rt(ctx);
jinja::value results = rt.execute(*prog);
auto parts = jinja::runtime::gather_string_parts(results);
return parts->as_string().str();
} catch (const std::exception & e) {
throw std::runtime_error(std::string("jinja runtime: ") + e.what());
}
}

} // namespace dflash::common
26 changes: 26 additions & 0 deletions dflash/src/server/chat_template.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,30 @@ std::string render_chat_template(
// Detect the appropriate chat format for an architecture.
ChatFormat chat_format_for_arch(const std::string & arch);

// Render chat messages via a Jinja chat template (e.g. froggeric Qwen3.6
// template, or any of the llama.cpp models/templates/*.jinja files).
//
// Mirrors llama.cpp's common_chat_template_direct_apply: parses the template
// once per thread, converts inputs to jinja values, runs the program, returns
// the rendered prompt string.
//
// `template_src` literal Jinja source (read from --chat-template-file).
// Must be non-empty.
// `messages` conversation to render; exposed to the template as `messages`,
// one object per entry with `role`, `content` and, when non-empty,
// `tool_call_id`.
// `bos_token`,
// `eos_token` passed through to the template (Qwen3.6 templates may use
// {{bos_token}} / {{eos_token}}). Use empty strings if unknown.
// `add_generation_prompt`,
// `enable_thinking` passed through to the template as variables of the same
// name.
// `tools_json` optional JSON text holding the tool definitions; unless it is
// empty, "[]" or "null" it is parsed and injected as `tools` into the
// template context (otherwise `tools` is not set at all).
//
// Internally caches the most recently parsed program per thread (avoids
// re-parsing the template on every request).
//
// Throws on failure: std::runtime_error for an empty `template_src`, for
// `tools_json` that is not valid JSON, and for lexer/runtime errors; template
// parse errors surface as exceptions as well. Callers should catch
// std::exception and surface a 500 response.
std::string render_chat_template_jinja(
const std::string & template_src,
const std::vector<ChatMessage> & messages,
const std::string & bos_token,
const std::string & eos_token,
bool add_generation_prompt = true,
bool enable_thinking = false,
const std::string & tools_json = "");

} // namespace dflash::common
38 changes: 35 additions & 3 deletions dflash/src/server/http_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -439,9 +439,41 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
tools_json = req.tools.dump();
}

std::string rendered = render_chat_template(chat_msgs, chat_format_,
true, enable_thinking,
tools_json);
std::string rendered;
if (!config_.chat_template_src.empty()) {
// Jinja path: caller supplied a chat template file via
// --chat-template-file. Override the hardcoded QWEN3/LAGUNA
// renderer. Used for tool-using agents that need the Anthropic
// tool_use envelope (e.g. froggeric Qwen3.6 template).
//
// Special tokens like <|im_start|> / <|im_end|> are stored
// verbatim in the GGUF vocab — use raw_token() to skip the
// GPT-2 byte decode (otherwise <0xC4><0x91> nonsense appears).
const std::string & bos_str = (tokenizer_.bos_id() >= 0)
? tokenizer_.raw_token(tokenizer_.bos_id())
: std::string();
const std::string & eos_str = (tokenizer_.eos_id() >= 0)
? tokenizer_.raw_token(tokenizer_.eos_id())
: std::string();
try {
rendered = render_chat_template_jinja(
config_.chat_template_src,
chat_msgs,
bos_str,
eos_str,
/*add_generation_prompt=*/true,
enable_thinking,
tools_json);
} catch (const std::exception & e) {
send_error(fd, 500,
std::string("chat template (jinja) render failed: ") + e.what());
return true;
}
} else {
rendered = render_chat_template(chat_msgs, chat_format_,
true, enable_thinking,
tools_json);
}
req.prompt_tokens = tokenizer_.encode(rendered);

// Detect if prompt ends with <think> (model will start in reasoning mode).
Expand Down
6 changes: 6 additions & 0 deletions dflash/src/server/http_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ struct ServerConfig {
int disk_cache_min_tokens = 512; // only persist >= this many tokens
int disk_cache_continued_interval = 10240; // continued checkpoint every N tokens
int disk_cache_cold_max_tokens = 10240; // cold prefix for prompts longer than this

// Optional Jinja chat template (overrides the hardcoded ChatFormat::QWEN3
// / LAGUNA renderer when non-empty). Used for tool-using agents that need
// the Anthropic tool_use envelope, e.g. froggeric Qwen3.6 template.
std::string chat_template_src; // literal Jinja source (loaded from file)
std::string chat_template_path; // path it was loaded from (logged at startup)
};

// ─── Parsed request ─────────────────────────────────────────────────────
Expand Down
37 changes: 37 additions & 0 deletions dflash/src/server/server_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,13 @@ static void print_usage(const char * prog) {
" --kv-cache-min-tokens <N> Min tokens to persist (default: 512)\n"
" --kv-cache-interval <N> Continued checkpoint every N tokens (default: 10240)\n"
" --kv-cache-cold-max <N> Cold prefix for prompts longer than N tokens (default: 10240)\n"
"\n"
"Chat template (optional, e.g. froggeric Qwen3.6 template for tool-using\n"
"agents that need the Anthropic tool_use envelope):\n"
" --chat-template-file <path> Load a Jinja chat template file.\n"
" Overrides the hardcoded Qwen3/Laguna\n"
" renderer. Empty or missing falls back\n"
" to the hardcoded template.\n"
"\n", prog);
}

Expand Down Expand Up @@ -143,6 +150,36 @@ int main(int argc, char ** argv) {
sconfig.pflash_skip_park = true;
} else if (std::strcmp(argv[i], "--lazy-draft") == 0) {
sconfig.lazy_draft = true;
} else if (std::strcmp(argv[i], "--chat-template-file") == 0 && i + 1 < argc) {
const char * path = argv[++i];
std::FILE * f = std::fopen(path, "rb");
if (!f) {
std::fprintf(stderr, "[server] --chat-template-file: cannot open '%s'\n", path);
return 1;
}
std::fseek(f, 0, SEEK_END);
long n = std::ftell(f);
std::fseek(f, 0, SEEK_SET);
if (n <= 0) {
// The usage text promises "Empty or missing falls back to the
// hardcoded template." Honor that: log a warning and leave
// chat_template_src empty so http_server.cpp falls through to
// the hardcoded QWEN3/LAGUNA renderer, instead of aborting
// startup.
std::fclose(f);
std::fprintf(stderr, "[server] --chat-template-file: '%s' is empty, "
"falling back to hardcoded template\n", path);
} else {
sconfig.chat_template_src.resize((size_t)n);
size_t got = std::fread(sconfig.chat_template_src.data(), 1, (size_t)n, f);
std::fclose(f);
if (got != (size_t)n) {
std::fprintf(stderr, "[server] --chat-template-file: short read on '%s'\n", path);
return 1;
}
sconfig.chat_template_path = path;
std::fprintf(stderr, "[server] loaded chat template from %s (%ld bytes)\n", path, n);
}
} else if (std::strcmp(argv[i], "--kv-cache-dir") == 0 && i + 1 < argc) {
sconfig.disk_cache_dir = argv[++i];
} else if (std::strcmp(argv[i], "--kv-cache-budget") == 0 && i + 1 < argc) {
Expand Down
Loading
Loading