Skip to content

Commit aaf998a

Browse files
authored
Merge pull request #99 from SharpAI/fix/qwen3-jinja-template-issue-97
fix(inference/ui): resolve Qwen3 template issue and harden SwiftBuddy server/settings (#97)
2 parents 0cd94eb + 321fc21 commit aaf998a

11 files changed

Lines changed: 976 additions & 124 deletions

File tree

Package.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ let package = Package(
117117
),
118118
.testTarget(
119119
name: "SwiftLMTests",
120-
dependencies: ["SwiftLM"]
120+
dependencies: ["SwiftLM", "MLXInferenceCore"]
121121
)
122122
]
123123
)
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// CLICommandBuilder.swift — Pure function for building the equivalent CLI command
2+
// Lives in MLXInferenceCore so it can be unit-tested by SwiftLMTests without
3+
// requiring the SwiftBuddy app target.
4+
import Foundation
5+
6+
/// Builds the equivalent `swift run SwiftLM` command string from persisted settings.
///
/// `--model`, `--host`, `--port`, `--max-tokens` and `--temp` are always emitted.
/// Every other flag is emitted only when its value differs from the default this
/// function compares against, which keeps the command readable.
///
/// `host` and `modelId` are single-quoted when they contain characters a POSIX shell
/// would interpret (whitespace, quotes, `$`, `;`, …), so the result can be pasted into
/// a terminal without being re-split or expanded. Values made only of plain word
/// characters are emitted unchanged. The `<model-id>` and `<redacted>` placeholders
/// are emitted verbatim and must be replaced by the user before running the command.
///
/// - Parameters:
///   - config: The current `GenerationConfig`.
///   - host: The server host string (e.g. "127.0.0.1").
///   - port: The server port (e.g. 5413).
///   - parallel: Number of parallel request slots; emitted only when greater than 1.
///   - apiKeySet: `true` if an API key is configured (the key itself is never printed).
///   - modelId: The currently loaded model ID, or `nil` when no model is loaded.
/// - Returns: A multi-line shell command string suitable for display and copy.
public func buildCLICommand(
    config: GenerationConfig,
    host: String,
    port: Int,
    parallel: Int,
    apiKeySet: Bool,
    modelId: String?
) -> String {
    // Returns `value` unchanged when it is a plain shell word; otherwise wraps it in
    // single quotes, escaping embedded single quotes as '\''.
    func shellWord(_ value: String) -> String {
        let plain = CharacterSet(
            charactersIn: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-./:@%+=,"
        )
        if !value.isEmpty, value.unicodeScalars.allSatisfy({ plain.contains($0) }) {
            return value
        }
        return "'" + value.replacingOccurrences(of: "'", with: "'\\''") + "'"
    }

    // Two-decimal rendering shared by all floating-point flags.
    func decimal(_ value: Float) -> String {
        String(format: "%.2f", value)
    }

    // Flags that are always present.
    var parts: [String] = [
        "--model \(modelId.map(shellWord) ?? "<model-id>")",
        "--host \(shellWord(host))",
        "--port \(port)",
        "--max-tokens \(config.maxTokens)",
        "--temp \(decimal(config.temperature))"
    ]

    // Sampling flags, only when they differ from the compared defaults.
    if config.topP < 1.0 {
        parts.append("--top-p \(decimal(config.topP))")
    }
    if config.topK != 50 {
        parts.append("--top-k \(config.topK)")
    }
    if config.minP > 0 {
        parts.append("--min-p \(decimal(config.minP))")
    }
    if config.repetitionPenalty != 1.05 {
        parts.append("--repeat-penalty \(decimal(config.repetitionPenalty))")
    }

    // Engine flags.
    if config.prefillSize != 512 {
        parts.append("--prefill-size \(config.prefillSize)")
    }
    if let kvBits = config.kvBits {
        parts.append("--kv-bits \(kvBits)")
        // The group size is only reported alongside --kv-bits.
        if config.kvGroupSize != 64 {
            parts.append("--kv-group-size \(config.kvGroupSize)")
        }
    }
    if config.enableThinking {
        parts.append("--thinking")
    }
    if let seed = config.seed {
        parts.append("--seed \(seed)")
    }

    // Server flags.
    if parallel > 1 {
        parts.append("--parallel \(parallel)")
    }
    if apiKeySet {
        // Never print the real key; the placeholder only signals that one is configured.
        parts.append("--api-key <redacted>")
    }

    return "swift run SwiftLM " + parts.joined(separator: " \\\n ")
}
Lines changed: 64 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,29 @@
11
// GenerationConfig.swift — SwiftLM inference parameters
22
import Foundation
33

4-
/// Configuration for a single generation request.
5-
public struct GenerationConfig: Sendable {
4+
/// Per-request generation parameters, persisted across app launches via UserDefaults.
5+
///
6+
/// ### Field classification
7+
/// **Per-request** (applied on every `generate()` call — no reload needed):
8+
/// temperature, topP, topK, minP, repetitionPenalty, seed, enableThinking,
9+
/// prefillSize, kvBits, kvGroupSize, turboKV
10+
///
11+
/// **Load-time** (requires model reload to take effect):
12+
/// streamExperts — controls SSD expert streaming for MoE and large models.
13+
/// Stored here for persistence but applied by InferenceEngine at load time.
14+
public struct GenerationConfig: Sendable, Codable {
615
public var maxTokens: Int
716
public var temperature: Float
817
public var topP: Float
918
public var topK: Int
1019
public var minP: Float
1120
public var repetitionPenalty: Float
12-
public var seed: UInt64?
13-
public var enableThinking: Bool
1421

15-
// ── SwiftLM Engine Parameters ──────────────────────────────────────
16-
/// Enable TurboQuant KV-cache compression (3-bit PolarQuant+QJL).
17-
/// Compresses KV history > 8192 tokens to ~3.5 bits/token.
18-
public var turboKV: Bool
22+
/// Optional RNG seed for reproducible outputs.
23+
/// When non-nil, `MLX.seed(seed)` is called before each generation using this `UInt64` value.
24+
public var seed: UInt64?
1925

20-
/// Enable SSD expert streaming for MoE models.
21-
public var streamExperts: Bool
26+
public var enableThinking: Bool
2227

2328
/// Chunk size for prefill evaluation.
2429
/// Lower values prevent GPU timeout on large models.
@@ -30,6 +35,21 @@ public struct GenerationConfig: Sendable {
3035
/// KV-cache quantization group size (default 64).
3136
public var kvGroupSize: Int
3237

38+
/// Enable 3-bit TurboQuant KV-cache compression (PolarQuant+QJL).
39+
/// Compresses KV history older than 8192 tokens to ~3.5 bits/token.
40+
/// Recommended for 100k+ context to halve KV RAM usage.
41+
/// Applied per-request — no model reload needed.
42+
public var turboKV: Bool
43+
44+
/// Enable SSD expert streaming for MoE (and any large) models.
45+
/// When true, expert weights are mmap'd from NVMe and only active
46+
/// expert pages reside in RAM during inference (Flash-MoE style).
47+
/// ⚠️ LOAD-TIME flag: changes take effect on the next model load.
48+
/// MoE models (isMoE == true) default to true automatically;
49+
/// this flag lets users override that for non-catalog models or
50+
/// force-disable streaming even on MoE models.
51+
public var streamExperts: Bool
52+
3353
public init(
3454
maxTokens: Int = 2048,
3555
temperature: Float = 0.6,
@@ -39,11 +59,11 @@ public struct GenerationConfig: Sendable {
3959
repetitionPenalty: Float = 1.05,
4060
seed: UInt64? = nil,
4161
enableThinking: Bool = false,
42-
turboKV: Bool = false,
43-
streamExperts: Bool = false,
4462
prefillSize: Int = 512,
4563
kvBits: Int? = nil,
46-
kvGroupSize: Int = 64
64+
kvGroupSize: Int = 64,
65+
turboKV: Bool = false,
66+
streamExperts: Bool = false
4767
) {
4868
self.maxTokens = maxTokens
4969
self.temperature = temperature
@@ -53,12 +73,41 @@ public struct GenerationConfig: Sendable {
5373
self.repetitionPenalty = repetitionPenalty
5474
self.seed = seed
5575
self.enableThinking = enableThinking
56-
self.turboKV = turboKV
57-
self.streamExperts = streamExperts
5876
self.prefillSize = prefillSize
5977
self.kvBits = kvBits
6078
self.kvGroupSize = kvGroupSize
79+
self.turboKV = turboKV
80+
self.streamExperts = streamExperts
6181
}
6282

6383
public static let `default` = GenerationConfig()
84+
85+
// MARK: — Persistence
86+
87+
/// `UserDefaults` key under which the JSON-encoded configuration is stored.
private static let storageKey = "swiftlm.generationConfig"
88+
89+
/// Whether a value is currently stored under the configuration's `UserDefaults` key.
///
/// Lets callers tell the first-run/default state apart from an explicit, saved choice.
public static var hasPersistedConfig: Bool {
    let stored = UserDefaults.standard.object(forKey: storageKey)
    return stored != nil
}
94+
95+
/// Resolves the SSD expert-streaming setting that should actually be used.
///
/// - Parameter defaultValue: The value to use while no configuration has been saved
///   (per the original note, MoE models default to streaming on).
/// - Returns: `defaultValue` when nothing has been persisted yet; otherwise this
///   configuration's own `streamExperts` toggle, which is then authoritative.
public func effectiveStreamExperts(defaultingTo defaultValue: Bool) -> Bool {
    guard Self.hasPersistedConfig else { return defaultValue }
    return streamExperts
}
101+
102+
/// Persists this configuration to `UserDefaults.standard` as JSON.
///
/// Best effort: if encoding fails, nothing is written and the previously stored
/// value (if any) is left untouched.
public func save() {
    if let encoded = try? JSONEncoder().encode(self) {
        UserDefaults.standard.set(encoded, forKey: Self.storageKey)
    }
}
106+
107+
/// Loads the persisted configuration from `UserDefaults.standard`.
///
/// Returns `.default` when nothing has been saved. When a stored value exists but
/// cannot be read back as a `GenerationConfig` (for example, it is not `Data`, or it
/// no longer decodes), the unusable entry is removed before falling back to
/// `.default`. Otherwise `hasPersistedConfig` would keep reporting a saved
/// configuration, and the default `streamExperts` value would be treated as an
/// explicit user choice.
///
/// - Returns: The decoded configuration, or `.default` if none is usable.
public static func load() -> GenerationConfig {
    let defaults = UserDefaults.standard

    if let data = defaults.data(forKey: storageKey),
       let decoded = try? JSONDecoder().decode(GenerationConfig.self, from: data) {
        return decoded
    }

    // Only touch the store when something unreadable is actually present, so a
    // plain first-run load stays side-effect free.
    if defaults.object(forKey: storageKey) != nil {
        defaults.removeObject(forKey: storageKey)
    }
    return .default
}
64113
}

0 commit comments

Comments
 (0)