Skip to content

Commit 480e349

Browse files
committed
refactor: rename project from mlx-server to SwiftLM
- Rename Sources/mlx-server/ → Sources/SwiftLM/
- Update Package.swift: package name, target name, source path
- Update all [mlx-server] log prefixes to [SwiftLM]
- Update ~/.mlx-server/wisdom/ path to ~/.swiftlm/wisdom/
- Update CLI commandName to SwiftLM
- Update GitHub Actions workflows: binary path, tarball names, release titles
- Update all documentation files
1 parent 2ca5b02 commit 480e349

11 files changed

Lines changed: 94 additions & 94 deletions

File tree

.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ jobs:
3333

3434
- name: Verify binary
3535
run: |
36-
ls -lh .build/release/mlx-server
37-
file .build/release/mlx-server
36+
ls -lh .build/release/SwiftLM
37+
file .build/release/SwiftLM
3838
3939
- name: TurboQuant unit tests
4040
run: |

.github/workflows/e2e-test.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ jobs:
6060
# Retry up to 2 times for transient HuggingFace download failures
6161
for attempt in 1 2 3; do
6262
echo "Attempt $attempt of 3..."
63-
if tests/test-server.sh .build/release/mlx-server 15413; then
63+
if tests/test-server.sh .build/release/SwiftLM 15413; then
6464
exit 0
6565
fi
6666
if [ "$attempt" -lt 3 ]; then
@@ -76,5 +76,5 @@ jobs:
7676
uses: actions/upload-artifact@v4
7777
with:
7878
name: e2e-test-logs
79-
path: /tmp/mlx-server-test-*.log
79+
path: /tmp/SwiftLM-test-*.log
8080
retention-days: 7

.github/workflows/release.yml

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -65,31 +65,31 @@ jobs:
6565

6666
- name: Verify binary
6767
run: |
68-
ls -lh .build/release/mlx-server
69-
file .build/release/mlx-server
70-
.build/release/mlx-server --help || true
68+
ls -lh .build/release/SwiftLM
69+
file .build/release/SwiftLM
70+
.build/release/SwiftLM --help || true
7171
7272
- name: Package binary
7373
run: |
7474
mkdir -p release
75-
cp .build/release/mlx-server release/
75+
cp .build/release/SwiftLM release/
7676
cp LICENSE README.md release/
7777
cd release
78-
tar -czvf ../mlx-server-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz .
78+
tar -czvf ../SwiftLM-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz .
7979
8080
- name: Upload artifact
8181
uses: actions/upload-artifact@v4
8282
with:
83-
name: mlx-server-${{ steps.tag.outputs.name }}-macos-arm64
84-
path: mlx-server-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz
83+
name: SwiftLM-${{ steps.tag.outputs.name }}-macos-arm64
84+
path: SwiftLM-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz
8585
retention-days: 90
8686

8787
- name: Prepare release notes
8888
id: notes
8989
run: |
9090
CHANGELOG=$(cat /tmp/changelog.txt)
9191
cat > /tmp/release_notes.md << 'RELEASE_EOF'
92-
## mlx-server ${{ steps.tag.outputs.full }}
92+
## SwiftLM ${{ steps.tag.outputs.full }}
9393
9494
<details open>
9595
@@ -105,25 +105,25 @@ jobs:
105105
106106
### Download
107107
108-
- [macOS Apple Silicon (arm64)](https://github.com/SharpAI/mlx-server/releases/download/${{ steps.tag.outputs.name }}/mlx-server-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz)
108+
- [macOS Apple Silicon (arm64)](https://github.com/SharpAI/SwiftLM/releases/download/${{ steps.tag.outputs.name }}/SwiftLM-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz)
109109
110110
### Quick Start
111111
```bash
112-
tar -xzf mlx-server-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz
113-
./mlx-server --model mlx-community/Qwen2.5-3B-Instruct-4bit --port 5413
112+
tar -xzf SwiftLM-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz
113+
./SwiftLM --model mlx-community/Qwen2.5-3B-Instruct-4bit --port 5413
114114
```
115115
116-
> **Note:** Requires `mlx.metallib` next to the binary for GPU compute. See [README](https://github.com/SharpAI/mlx-server#metal-shader-library) for setup.
116+
> **Note:** Requires `mlx.metallib` next to the binary for GPU compute. See [README](https://github.com/SharpAI/SwiftLM#metal-shader-library) for setup.
117117
RELEASE_EOF
118118
119119
- name: Create release
120120
if: ${{ github.event_name == 'push' || github.event.inputs.create_release == 'true' }}
121121
uses: softprops/action-gh-release@v2
122122
with:
123123
tag_name: ${{ steps.tag.outputs.name }}
124-
name: "mlx-server ${{ steps.tag.outputs.name }}"
124+
name: "SwiftLM ${{ steps.tag.outputs.name }}"
125125
body_path: /tmp/release_notes.md
126126
files: |
127-
mlx-server-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz
127+
SwiftLM-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz
128128
draft: false
129129
prerelease: false

AEGIS_INTEGRATION.md

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,24 @@
11
# 🛡️ Aegis-AI Integration Guide
22

3-
`mlx-server` is designed to be a **completely transparent, drop-in replacement** for `llama-server` or any cloud VLM gateway within Aegis-AI, delivering dramatically faster zero-latency inference on Apple Silicon.
3+
`SwiftLM` is designed to be a **completely transparent, drop-in replacement** for `llama-server` or any cloud VLM gateway within Aegis-AI, delivering dramatically faster zero-latency inference on Apple Silicon.
44

55
---
66

77
## 🚀 Quick Start for Aegis-AI
88

99
### 1. Download the Binary
1010

11-
Download the latest pre-built binary from the [Releases page](https://github.com/SharpAI/mlx-server/releases) — no Xcode required:
11+
Download the latest pre-built binary from the [Releases page](https://github.com/SharpAI/SwiftLM/releases) — no Xcode required:
1212

1313
```bash
1414
# Extract and make executable
15-
tar -xzf mlx-server-*-macos-arm64.tar.gz
16-
chmod +x mlx-server
15+
tar -xzf SwiftLM-*-macos-arm64.tar.gz
16+
chmod +x SwiftLM
1717
```
1818

1919
### 2. Point Aegis-AI at the Server
2020

21-
In your `~/.aegis-ai/llm-config.json`, set the base URL to the mlx-server endpoint:
21+
In your `~/.aegis-ai/llm-config.json`, set the base URL to the SwiftLM endpoint:
2222

2323
```json
2424
{
@@ -30,10 +30,10 @@ In your `~/.aegis-ai/llm-config.json`, set the base URL to the mlx-server endpoi
3030

3131
### 3. Launch the Sidecar
3232

33-
Aegis-AI should spin up `mlx-server` as a managed subprocess:
33+
Aegis-AI should spin up `SwiftLM` as a managed subprocess:
3434

3535
```bash
36-
/path/to/mlx-server \
36+
/path/to/SwiftLM \
3737
--model mlx-community/Qwen2.5-7B-Instruct-4bit \
3838
--host 127.0.0.1 \
3939
--port 5413
@@ -54,7 +54,7 @@ Aegis-AI should **wait for this event** before routing any requests to the serve
5454
If you are running a Mixture of Experts (MoE) model — such as `Qwen3.5-122B-A10B` — you **must** pass the `--stream-experts true` flag.
5555

5656
```bash
57-
/path/to/mlx-server \
57+
/path/to/SwiftLM \
5858
--model mlx-community/Qwen3.5-122B-A10B-4bit \
5959
--host 127.0.0.1 \
6060
--port 5413 \
@@ -66,7 +66,7 @@ If you are running a Mixture of Experts (MoE) model — such as `Qwen3.5-122B-A1
6666
6767
### Why `--stream-experts` Works
6868

69-
MoE models like Qwen3.5-122B have 122B *total* parameters, but only ~10B are **active** on any single forward pass. `mlx-server` exploits this sparsity:
69+
MoE models like Qwen3.5-122B have 122B *total* parameters, but only ~10B are **active** on any single forward pass. `SwiftLM` exploits this sparsity:
7070

7171
- The 60GB+ of expert weight matrices are `mmap`'d directly from your NVMe SSD
7272
- Only the **2-4 specific expert shards** selected by the router for the current token (~1.5MB each) are streamed into GPU RAM via a zero-copy DMA path
@@ -85,13 +85,13 @@ Due to SSD streaming, TTFT is higher than a fully in-memory model. This is **exp
8585
| Long (1000+ tokens) | 1–3 minutes |
8686

8787
> [!TIP]
88-
> **Aegis-AI Prompt Cache**: `mlx-server` automatically caches the KV state for repeated system prompts. After the first request with a given system prompt, subsequent requests with the same system prompt will skip the expensive prefill phase and start streaming almost immediately.
88+
> **Aegis-AI Prompt Cache**: `SwiftLM` automatically caches the KV state for repeated system prompts. After the first request with a given system prompt, subsequent requests with the same system prompt will skip the expensive prefill phase and start streaming almost immediately.
8989
9090
---
9191

9292
## 📡 API Reference
9393

94-
`mlx-server` is **fully OpenAI-compatible** — any client using the OpenAI SDK works without modification.
94+
`SwiftLM` is **fully OpenAI-compatible** — any client using the OpenAI SDK works without modification.
9595

9696
### Endpoints
9797

@@ -174,7 +174,7 @@ curl http://127.0.0.1:5413/v1/chat/completions \
174174

175175
## 🔍 Memory Behaviour Explained
176176

177-
On Apple Silicon, GPU and system RAM are the **same physical chips** (Unified Memory Architecture). `mlx-server` uses a layered strategy to fit the largest possible models:
177+
On Apple Silicon, GPU and system RAM are the **same physical chips** (Unified Memory Architecture). `SwiftLM` uses a layered strategy to fit the largest possible models:
178178

179179
| Model Size vs. RAM | Strategy | Notes |
180180
|---|---|---|
@@ -186,7 +186,7 @@ On Apple Silicon, GPU and system RAM are the **same physical chips** (Unified Me
186186
You can always inspect the computed memory plan before loading a model:
187187

188188
```bash
189-
mlx-server --model mlx-community/Qwen3.5-122B-A10B-4bit --info
189+
SwiftLM --model mlx-community/Qwen3.5-122B-A10B-4bit --info
190190
```
191191

192192
---
@@ -202,5 +202,5 @@ mlx-server --model mlx-community/Qwen3.5-122B-A10B-4bit --info
202202
## 🔗 Resources
203203

204204
- [Main README](./README.md) — general usage and benchmarks
205-
- [GitHub Releases](https://github.com/SharpAI/mlx-server/releases) — pre-built binaries
205+
- [GitHub Releases](https://github.com/SharpAI/SwiftLM/releases) — pre-built binaries
206206
- [mlx-swift](https://github.com/ml-explore/mlx-swift) — underlying MLX framework

Package.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import PackageDescription
33

44
let package = Package(
5-
name: "mlx-server",
5+
name: "SwiftLM",
66
platforms: [.macOS(.v14)],
77
dependencies: [
88
// Local Apple MLX Swift fork for C++ extensions
@@ -18,7 +18,7 @@ let package = Package(
1818
],
1919
targets: [
2020
.executableTarget(
21-
name: "mlx-server",
21+
name: "SwiftLM",
2222
dependencies: [
2323
.product(name: "MLX", package: "mlx-swift"),
2424
.product(name: "MLXLLM", package: "mlx-swift-lm"),
@@ -28,7 +28,7 @@ let package = Package(
2828
.product(name: "Hummingbird", package: "hummingbird"),
2929
.product(name: "ArgumentParser", package: "swift-argument-parser"),
3030
],
31-
path: "Sources/mlx-server",
31+
path: "Sources/SwiftLM",
3232
swiftSettings: [
3333
.enableExperimentalFeature("StrictConcurrency")
3434
]

README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# ⚡️ mlx-server
1+
# ⚡️ SwiftLM
22

33
A blazingly fast, native Swift inference server that serves [MLX](https://github.com/ml-explore/mlx) models with a strict **OpenAI-compatible API**.
44

@@ -17,7 +17,7 @@ No Python runtime, no Global Interpreter Lock (GIL), no unnecessary memory copie
1717

1818
## ⚡️ TurboQuantization: KV Cache Compression
1919

20-
`mlx-server` implements **TurboQuant** (AISTATS/ICLR 2026) for on-the-fly KV cache compression, enabling long-context inference with drastically reduced memory. At 3 bits/coordinate, the KV cache is compressed ~5.8× vs FP16 with near-zero accuracy loss.
20+
`SwiftLM` implements **TurboQuant** (AISTATS/ICLR 2026) for on-the-fly KV cache compression, enabling long-context inference with drastically reduced memory. At 3 bits/coordinate, the KV cache is compressed ~5.8× vs FP16 with near-zero accuracy loss.
2121

2222
The algorithm runs in two stages per KV vector:
2323

@@ -45,7 +45,7 @@ Reference implementation: [`turboquant_plus`](https://github.com/TheTom/turboqua
4545

4646
## 💻 Tested Hardware & Benchmarks
4747

48-
To reliably run massive 122B parameter MoE models over SSD streaming, `mlx-server` was designed and benchmarked natively on the following hardware:
48+
To reliably run massive 122B parameter MoE models over SSD streaming, `SwiftLM` was designed and benchmarked natively on the following hardware:
4949

5050
- **Machine**: MacBook Pro, Apple M5 Pro
5151
- **Memory**: 64 GB Unified Memory
@@ -59,7 +59,7 @@ To reliably run massive 122B parameter MoE models over SSD streaming, `mlx-serve
5959
## 🛠️ Quick Start
6060

6161
### Fastest: Download Pre-built Binary
62-
The absolute fastest way to get started is to [download the latest pre-compiled macOS binary](https://github.com/SharpAI/mlx-server/releases) directly from the Releases page. Just extract it and run!
62+
The absolute fastest way to get started is to [download the latest pre-compiled macOS binary](https://github.com/SharpAI/SwiftLM/releases) directly from the Releases page. Just extract it and run!
6363

6464
### Build from Source
6565

@@ -70,7 +70,7 @@ swift build -c release
7070
### Run (Downloads model natively on first launch)
7171

7272
```bash
73-
.build/release/mlx-server \
73+
.build/release/SwiftLM \
7474
--model Qwen3.5-122B-A10B-4bit \
7575
--stream-experts true \
7676
--port 5413
@@ -133,7 +133,7 @@ Built entirely on the hard work of the Apple MLX community.
133133

134134
### 🙏 TurboQuant Credits
135135

136-
The TurboQuant KV cache compression implemented in `mlx-server` is directly based on the following open-source work and research:
136+
The TurboQuant KV cache compression implemented in `SwiftLM` is directly based on the following open-source work and research:
137137

138138
- **[TheTom/llama-cpp-turboquant](https://github.com/TheTom/llama-cpp-turboquant/tree/feature/turboquant-kv-cache)** — The primary reference for the C and Metal GPU implementation. The `turbo-wht.h` Fast Walsh-Hadamard kernel, WHT sign arrays (seed=42), Lloyd-Max centroid tables, and the `ggml-turbo-quant.c` quantize/dequantize logic were ported directly from this repository into our MLX C++ and Metal backend.
139139

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
//
66
// On first run with a new model, the calibrator runs a short benchmark to find
77
// the optimal cache limit that maximizes tok/s. The result is stored in
8-
// ~/.mlx-server/wisdom/<hash>.json and loaded directly on future runs.
8+
// ~/.swiftlm/wisdom/<hash>.json and loaded directly on future runs.
99
//
1010
// Usage:
1111
// let wisdom = try await Calibrator.calibrate(container: container, plan: plan, profile: profile)
@@ -47,7 +47,7 @@ enum Calibrator {
4747
/// Directory for wisdom files
4848
private static var wisdomDirectory: URL {
4949
let home = FileManager.default.homeDirectoryForCurrentUser
50-
return home.appendingPathComponent(".mlx-server/wisdom")
50+
return home.appendingPathComponent(".swiftlm/wisdom")
5151
}
5252

5353
/// Hardware fingerprint: chip + memory + OS
@@ -90,7 +90,7 @@ enum Calibrator {
9090
decoder.dateDecodingStrategy = .iso8601
9191
return try decoder.decode(WisdomEntry.self, from: data)
9292
} catch {
93-
print("[mlx-server] ⚠️ Failed to load wisdom: \(error.localizedDescription)")
93+
print("[SwiftLM] ⚠️ Failed to load wisdom: \(error.localizedDescription)")
9494
return nil
9595
}
9696
}
@@ -121,7 +121,7 @@ enum Calibrator {
121121
contextSize: Int = 4096
122122
) async throws -> WisdomEntry {
123123
let startTime = Date()
124-
print("[mlx-server] 📊 Calibrating... (this only happens once per model × hardware)")
124+
print("[SwiftLM] 📊 Calibrating... (this only happens once per model × hardware)")
125125

126126
// Determine trial cache limits based on available memory
127127
let systemRAMBytes = Int(ProcessInfo.processInfo.physicalMemory)
@@ -155,7 +155,7 @@ enum Calibrator {
155155
let maxTokens = 30 // Just enough to measure steady-state decode speed
156156

157157
for (idx, trial) in trials.enumerated() {
158-
print("[mlx-server] Trial \(idx + 1)/\(trials.count): \(trial.label) (\(trial.cacheLimitBytes / (1024*1024))MB)")
158+
print("[SwiftLM] Trial \(idx + 1)/\(trials.count): \(trial.label) (\(trial.cacheLimitBytes / (1024*1024))MB)")
159159

160160
// Set cache limit for this trial
161161
if trial.cacheLimitBytes > 0 {
@@ -173,13 +173,13 @@ enum Calibrator {
173173
)
174174

175175
if let result = result {
176-
print("[mlx-server] → \(String(format: "%.1f", result.tokPerSec)) tok/s decode, \(String(format: "%.0f", result.ttftMs))ms TTFT")
176+
print("[SwiftLM] → \(String(format: "%.1f", result.tokPerSec)) tok/s decode, \(String(format: "%.0f", result.ttftMs))ms TTFT")
177177

178178
if bestTrial == nil || result.tokPerSec > bestTrial!.tokPerSec {
179179
bestTrial = (trial, result.tokPerSec, result.prefillTokPerSec, result.ttftMs)
180180
}
181181
} else {
182-
print("[mlx-server] → failed, skipping")
182+
print("[SwiftLM] → failed, skipping")
183183
}
184184
}
185185

@@ -209,9 +209,9 @@ enum Calibrator {
209209

210210
try saveWisdom(entry)
211211

212-
print("[mlx-server] 📊 Calibration complete in \(String(format: "%.1f", elapsed))s")
213-
print("[mlx-server] Winner: \(best.trial.label)\(String(format: "%.1f", best.tokPerSec)) tok/s")
214-
print("[mlx-server] Saved to ~/.mlx-server/wisdom/")
212+
print("[SwiftLM] 📊 Calibration complete in \(String(format: "%.1f", elapsed))s")
213+
print("[SwiftLM] Winner: \(best.trial.label)\(String(format: "%.1f", best.tokPerSec)) tok/s")
214+
print("[SwiftLM] Saved to ~/.swiftlm/wisdom/")
215215

216216
return entry
217217
}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,7 @@ enum ModelProfiler {
430430
let thinSep = String(repeating: "", count: 56)
431431

432432
print("\(separator)")
433-
print("mlx-server Model Memory Analysis\(String(repeating: " ", count: 25))")
433+
print("SwiftLM Model Memory Analysis\(String(repeating: " ", count: 25))")
434434
print("\(separator)")
435435

436436
// Model info

0 commit comments

Comments
 (0)