Skip to content

Commit 91ee743

Browse files
simbasimba
authored and committed
feat: add --thinking flag to disable thinking mode by default (Qwen3.5)
1 parent 3e1f923 commit 91ee743

1 file changed

Lines changed: 8 additions & 1 deletion

File tree

Sources/mlx-server/main.swift

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,9 @@ struct MLXServer: AsyncParsableCommand {
5151
@Option(name: .long, help: "Number of parallel request slots")
5252
var parallel: Int = 1
5353

54+
@Flag(name: .long, help: "Enable thinking/reasoning mode (Qwen3.5 etc). Default: disabled")
55+
var thinking: Bool = false
56+
5457
mutating func run() async throws {
5558
print("[mlx-server] Loading model: \(model)")
5659
let modelId = model
@@ -72,6 +75,7 @@ struct MLXServer: AsyncParsableCommand {
7275
let defaultTemp = self.temp
7376
let defaultTopP = self.topP
7477
let defaultRepeatPenalty = self.repeatPenalty
78+
let thinkingEnabled = self.thinking
7579
let parallelSlots = self.parallel
7680

7781
// ── Concurrency limiter ──
@@ -141,7 +145,10 @@ struct MLXServer: AsyncParsableCommand {
141145
// ── Acquire slot (concurrency limiter) ──
142146
await semaphore.wait()
143147

144-
let userInput = UserInput(chat: chatMessages)
148+
// Pass enable_thinking to the Jinja chat template via additionalContext
149+
// (mirrors llama-server's --chat-template-kwargs '{"enable_thinking":false}')
150+
let templateContext: [String: any Sendable]? = thinkingEnabled ? nil : ["enable_thinking": false]
151+
let userInput = UserInput(chat: chatMessages, additionalContext: templateContext)
145152
let lmInput = try await container.prepare(input: userInput)
146153
let stream = try await container.generate(input: lmInput, parameters: params)
147154

0 commit comments

Comments
 (0)