Skip to content

Commit 594a1f7

Browse files
authored
Merge pull request #29 from PolynomialDivision/strip-gemma4
Strip Gemma 4 thinking tokens from translation output
2 parents 7a4d939 + 9e897d7 commit 594a1f7

1 file changed

Lines changed: 20 additions & 4 deletions

File tree

ltengine/src/llm.rs

Lines changed: 20 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -188,13 +188,17 @@ impl LLM {
188188
// Use the model's embedded chat template when llama.cpp can detect it.
189189
// Falls back to hardcoded Gemma format when detection fails (e.g. Gemma 4
190190
// until llama-cpp-sys picks up the upstream Gemma 4 template detection fix).
191-
let llm_input = self.model
191+
let llm_input = match self.model
192192
.chat_template(None)
193193
.ok()
194194
.and_then(|tmpl| self.model.apply_chat_template(&tmpl, &messages, true).ok())
195-
.unwrap_or_else(|| format!(
196-
"<start_of_turn>user\n{system}\n\n{user}<end_of_turn>\n<start_of_turn>model\n"
197-
));
195+
{
196+
Some(s) => s,
197+
None => {
198+
eprintln!("ltengine: apply_chat_template failed: using hardcoded Gemma format");
199+
format!("<start_of_turn>user\n{system}\n\n{user}<end_of_turn>\n<start_of_turn>model\n")
200+
}
201+
};
198202

199203
// BOS is not added by apply_chat_template — str_to_token handles it.
200204
let tokens_list = self.model
@@ -278,6 +282,17 @@ impl LLMContext<'_>{
278282
self.ctx.decode(&mut batch).with_context(|| "Failed to eval")?;
279283
}
280284

285+
// Gemma 4 thinking mode emits thinking content before the actual response in two forms:
286+
// 1. <|channel>thought\n...<channel|>answer (full block with closing tag)
287+
// 2. <|channel>thought answer (no closing tag, space-separated)
288+
let output = if let Some(pos) = output.find("<channel|>") {
289+
output[pos + "<channel|>".len()..].to_owned()
290+
} else if let Some(rest) = output.strip_prefix("<|channel>thought") {
291+
rest.trim_start_matches(['\n', ' ']).to_owned()
292+
} else {
293+
output
294+
};
295+
281296
// Gemma may emit <end_of_turn> as literal text when it cannot translate
282297
// (e.g. unsupported language/format combination) instead of the special
283298
// EOG token caught above. Strip it and treat empty output as an error.
@@ -286,6 +301,7 @@ impl LLMContext<'_>{
286301
if output.is_empty() {
287302
return Err(anyhow::anyhow!("Model produced empty output"));
288303
}
304+
289305
Ok(output)
290306
}
291307
}

0 commit comments

Comments
 (0)