@@ -51,6 +51,9 @@ struct MLXServer: AsyncParsableCommand {
5151 @Option ( name: . long, help: " Number of parallel request slots " )
5252 var parallel : Int = 1
5353
54+ @Flag ( name: . long, help: " Enable thinking/reasoning mode (Qwen3.5 etc). Default: disabled " )
55+ var thinking : Bool = false
56+
5457 mutating func run( ) async throws {
5558 print ( " [mlx-server] Loading model: \( model) " )
5659 let modelId = model
@@ -72,6 +75,7 @@ struct MLXServer: AsyncParsableCommand {
7275 let defaultTemp = self . temp
7376 let defaultTopP = self . topP
7477 let defaultRepeatPenalty = self . repeatPenalty
78+ let thinkingEnabled = self . thinking
7579 let parallelSlots = self . parallel
7680
7781 // ── Concurrency limiter ──
@@ -141,7 +145,10 @@ struct MLXServer: AsyncParsableCommand {
141145 // ── Acquire slot (concurrency limiter) ──
142146 await semaphore. wait ( )
143147
144- let userInput = UserInput ( chat: chatMessages)
148+ // Thinking disabled: pass enable_thinking=false to the Jinja chat template via additionalContext
149+ // (mirrors llama-server's --chat-template-kwargs '{"enable_thinking":false}'). Thinking enabled: pass nil (no override).
150+ let templateContext : [ String : any Sendable ] ? = thinkingEnabled ? nil : [ " enable_thinking " : false ]
151+ let userInput = UserInput ( chat: chatMessages, additionalContext: templateContext)
145152 let lmInput = try await container. prepare ( input: userInput)
146153 let stream = try await container. generate ( input: lmInput, parameters: params)
147154
0 commit comments