refactor: rename project from mlx-server to SwiftLM

solderzzc · solderzzc · commit 480e349191c2 · 2026-03-30T12:28:01.000-07:00
- Rename Sources/mlx-server/ → Sources/SwiftLM/
- Update Package.swift: package name, target name, source path
- Update all [mlx-server] log prefixes to [SwiftLM]
- Update ~/.mlx-server/wisdom/ path to ~/.swiftlm/wisdom/
- Update CLI commandName to SwiftLM
- Update GitHub Actions workflows: binary path, tarball names, release titles
- Update all documentation files
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -33,8 +33,8 @@ jobs:
 
       - name: Verify binary
         run: |
-          ls -lh .build/release/mlx-server
-          file .build/release/mlx-server
+          ls -lh .build/release/SwiftLM
+          file .build/release/SwiftLM
 
       - name: TurboQuant unit tests
         run: |
diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml
@@ -60,7 +60,7 @@ jobs:
           # Retry up to 2 times for transient HuggingFace download failures
           for attempt in 1 2 3; do
             echo "Attempt $attempt of 3..."
-            if tests/test-server.sh .build/release/mlx-server 15413; then
+            if tests/test-server.sh .build/release/SwiftLM 15413; then
               exit 0
             fi
             if [ "$attempt" -lt 3 ]; then
@@ -76,5 +76,5 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           name: e2e-test-logs
-          path: /tmp/mlx-server-test-*.log
+          path: /tmp/SwiftLM-test-*.log
           retention-days: 7
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -65,31 +65,31 @@ jobs:
 
       - name: Verify binary
         run: |
-          ls -lh .build/release/mlx-server
-          file .build/release/mlx-server
-          .build/release/mlx-server --help || true
+          ls -lh .build/release/SwiftLM
+          file .build/release/SwiftLM
+          .build/release/SwiftLM --help || true
 
       - name: Package binary
         run: |
           mkdir -p release
-          cp .build/release/mlx-server release/
+          cp .build/release/SwiftLM release/
           cp LICENSE README.md release/
           cd release
-          tar -czvf ../mlx-server-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz .
+          tar -czvf ../SwiftLM-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz .
 
       - name: Upload artifact
         uses: actions/upload-artifact@v4
         with:
-          name: mlx-server-${{ steps.tag.outputs.name }}-macos-arm64
-          path: mlx-server-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz
+          name: SwiftLM-${{ steps.tag.outputs.name }}-macos-arm64
+          path: SwiftLM-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz
           retention-days: 90
 
       - name: Prepare release notes
         id: notes
         run: |
           CHANGELOG=$(cat /tmp/changelog.txt)
           cat > /tmp/release_notes.md << 'RELEASE_EOF'
-          ## mlx-server ${{ steps.tag.outputs.full }}
+          ## SwiftLM ${{ steps.tag.outputs.full }}
 
           <details open>
 
@@ -105,25 +105,25 @@ jobs:
 
           ### Download
 
-          - [macOS Apple Silicon (arm64)](https://github.com/SharpAI/mlx-server/releases/download/${{ steps.tag.outputs.name }}/mlx-server-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz)
+          - [macOS Apple Silicon (arm64)](https://github.com/SharpAI/SwiftLM/releases/download/${{ steps.tag.outputs.name }}/SwiftLM-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz)
 
           ### Quick Start
           ```bash
-          tar -xzf mlx-server-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz
-          ./mlx-server --model mlx-community/Qwen2.5-3B-Instruct-4bit --port 5413
+          tar -xzf SwiftLM-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz
+          ./SwiftLM --model mlx-community/Qwen2.5-3B-Instruct-4bit --port 5413
           ```
 
-          > **Note:** Requires `mlx.metallib` next to the binary for GPU compute. See [README](https://github.com/SharpAI/mlx-server#metal-shader-library) for setup.
+          > **Note:** Requires `mlx.metallib` next to the binary for GPU compute. See [README](https://github.com/SharpAI/SwiftLM#metal-shader-library) for setup.
           RELEASE_EOF
 
       - name: Create release
         if: ${{ github.event_name == 'push' || github.event.inputs.create_release == 'true' }}
         uses: softprops/action-gh-release@v2
         with:
           tag_name: ${{ steps.tag.outputs.name }}
-          name: "mlx-server ${{ steps.tag.outputs.name }}"
+          name: "SwiftLM ${{ steps.tag.outputs.name }}"
           body_path: /tmp/release_notes.md
           files: |
-            mlx-server-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz
+            SwiftLM-${{ steps.tag.outputs.name }}-macos-arm64.tar.gz
           draft: false
           prerelease: false
diff --git a/AEGIS_INTEGRATION.md b/AEGIS_INTEGRATION.md
@@ -1,24 +1,24 @@
 # 🛡️ Aegis-AI Integration Guide
 
-`mlx-server` is designed to be a **completely transparent, drop-in replacement** for `llama-server` or any cloud VLM gateway within Aegis-AI, delivering dramatically faster zero-latency inference on Apple Silicon.
+`SwiftLM` is designed to be a **completely transparent, drop-in replacement** for `llama-server` or any cloud VLM gateway within Aegis-AI, delivering dramatically faster zero-latency inference on Apple Silicon.
 
 ---
 
 ## 🚀 Quick Start for Aegis-AI
 
 ### 1. Download the Binary
 
-Download the latest pre-built binary from the [Releases page](https://github.com/SharpAI/mlx-server/releases) — no Xcode required:
+Download the latest pre-built binary from the [Releases page](https://github.com/SharpAI/SwiftLM/releases) — no Xcode required:
 
 ```bash
 # Extract and make executable
-tar -xzf mlx-server-*-macos-arm64.tar.gz
-chmod +x mlx-server
+tar -xzf SwiftLM-*-macos-arm64.tar.gz
+chmod +x SwiftLM
 ```
 
 ### 2. Point Aegis-AI at the Server
 
-In your `~/.aegis-ai/llm-config.json`, set the base URL to the mlx-server endpoint:
+In your `~/.aegis-ai/llm-config.json`, set the base URL to the SwiftLM endpoint:
 
 ```json
 {
@@ -30,10 +30,10 @@ In your `~/.aegis-ai/llm-config.json`, set the base URL to the mlx-server endpoi
 
 ### 3. Launch the Sidecar
 
-Aegis-AI should spin up `mlx-server` as a managed subprocess:
+Aegis-AI should spin up `SwiftLM` as a managed subprocess:
 
 ```bash
-/path/to/mlx-server \
+/path/to/SwiftLM \
   --model mlx-community/Qwen2.5-7B-Instruct-4bit \
   --host 127.0.0.1 \
   --port 5413
@@ -54,7 +54,7 @@ Aegis-AI should **wait for this event** before routing any requests to the serve
 If you are running a Mixture of Experts (MoE) model — such as `Qwen3.5-122B-A10B` — you **must** pass the `--stream-experts true` flag.
 
 ```bash
-/path/to/mlx-server \
+/path/to/SwiftLM \
   --model mlx-community/Qwen3.5-122B-A10B-4bit \
   --host 127.0.0.1 \
   --port 5413 \
@@ -66,7 +66,7 @@ If you are running a Mixture of Experts (MoE) model — such as `Qwen3.5-122B-A1
 
 ### Why `--stream-experts` Works
 
-MoE models like Qwen3.5-122B have 122B *total* parameters, but only ~10B are **active** on any single forward pass. `mlx-server` exploits this sparsity:
+MoE models like Qwen3.5-122B have 122B *total* parameters, but only ~10B are **active** on any single forward pass. `SwiftLM` exploits this sparsity:
 
 - The 60GB+ of expert weight matrices are `mmap`'d directly from your NVMe SSD
 - Only the **2-4 specific expert shards** selected by the router for the current token (~1.5MB each) are streamed into GPU RAM via a zero-copy DMA path
@@ -85,13 +85,13 @@ Due to SSD streaming, TTFT is higher than a fully in-memory model. This is **exp
 | Long (1000+ tokens) | 1–3 minutes |
 
 > [!TIP]
-> **Aegis-AI Prompt Cache**: `mlx-server` automatically caches the KV state for repeated system prompts. After the first request with a given system prompt, subsequent requests with the same system prompt will skip the expensive prefill phase and start streaming almost immediately.
+> **Aegis-AI Prompt Cache**: `SwiftLM` automatically caches the KV state for repeated system prompts. After the first request with a given system prompt, subsequent requests with the same system prompt will skip the expensive prefill phase and start streaming almost immediately.
 
 ---
 
 ## 📡 API Reference
 
-`mlx-server` is **fully OpenAI-compatible** — any client using the OpenAI SDK works without modification.
+`SwiftLM` is **fully OpenAI-compatible** — any client using the OpenAI SDK works without modification.
 
 ### Endpoints
 
@@ -174,7 +174,7 @@ curl http://127.0.0.1:5413/v1/chat/completions \
 
 ## 🔍 Memory Behaviour Explained
 
-On Apple Silicon, GPU and system RAM are the **same physical chips** (Unified Memory Architecture). `mlx-server` uses a layered strategy to fit the largest possible models:
+On Apple Silicon, GPU and system RAM are the **same physical chips** (Unified Memory Architecture). `SwiftLM` uses a layered strategy to fit the largest possible models:
 
 | Model Size vs. RAM | Strategy | Notes |
 |---|---|---|
@@ -186,7 +186,7 @@ On Apple Silicon, GPU and system RAM are the **same physical chips** (Unified Me
 You can always inspect the computed memory plan before loading a model:
 
 ```bash
-mlx-server --model mlx-community/Qwen3.5-122B-A10B-4bit --info
+SwiftLM --model mlx-community/Qwen3.5-122B-A10B-4bit --info
 ```
 
 ---
@@ -202,5 +202,5 @@ mlx-server --model mlx-community/Qwen3.5-122B-A10B-4bit --info
 ## 🔗 Resources
 
 - [Main README](./README.md) — general usage and benchmarks
-- [GitHub Releases](https://github.com/SharpAI/mlx-server/releases) — pre-built binaries
+- [GitHub Releases](https://github.com/SharpAI/SwiftLM/releases) — pre-built binaries
 - [mlx-swift](https://github.com/ml-explore/mlx-swift) — underlying MLX framework
diff --git a/Package.swift b/Package.swift
@@ -2,7 +2,7 @@
 import PackageDescription
 
 let package = Package(
-    name: "mlx-server",
+    name: "SwiftLM",
     platforms: [.macOS(.v14)],
     dependencies: [
         // Local Apple MLX Swift fork for C++ extensions
@@ -18,7 +18,7 @@ let package = Package(
     ],
     targets: [
         .executableTarget(
-            name: "mlx-server",
+            name: "SwiftLM",
             dependencies: [
                 .product(name: "MLX", package: "mlx-swift"),
                 .product(name: "MLXLLM", package: "mlx-swift-lm"),
@@ -28,7 +28,7 @@ let package = Package(
                 .product(name: "Hummingbird", package: "hummingbird"),
                 .product(name: "ArgumentParser", package: "swift-argument-parser"),
             ],
-            path: "Sources/mlx-server",
+            path: "Sources/SwiftLM",
             swiftSettings: [
                 .enableExperimentalFeature("StrictConcurrency")
             ]
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# ⚡️ mlx-server
+# ⚡️ SwiftLM
 
 A blazingly fast, native Swift inference server that serves [MLX](https://github.com/ml-explore/mlx) models with a strict **OpenAI-compatible API**. 
 
@@ -17,7 +17,7 @@ No Python runtime, no Global Interpreter Lock (GIL), no unnecessary memory copie
 
 ## ⚡️ TurboQuantization: KV Cache Compression
 
-`mlx-server` implements **TurboQuant** (AISTATS/ICLR 2026) for on-the-fly KV cache compression, enabling long-context inference with drastically reduced memory. At 3 bits/coordinate, the KV cache is compressed ~5.8× vs FP16 with near-zero accuracy loss.
+`SwiftLM` implements **TurboQuant** (AISTATS/ICLR 2026) for on-the-fly KV cache compression, enabling long-context inference with drastically reduced memory. At 3 bits/coordinate, the KV cache is compressed ~5.8× vs FP16 with near-zero accuracy loss.
 
 The algorithm runs in two stages per KV vector:
 
@@ -45,7 +45,7 @@ Reference implementation: [`turboquant_plus`](https://github.com/TheTom/turboqua
 
 ## 💻 Tested Hardware & Benchmarks
 
-To reliably run massive 122B parameter MoE models over SSD streaming, `mlx-server` was designed and benchmarked natively on the following hardware:
+To reliably run massive 122B parameter MoE models over SSD streaming, `SwiftLM` was designed and benchmarked natively on the following hardware:
 
 - **Machine**: MacBook Pro, Apple M5 Pro
 - **Memory**: 64 GB Unified Memory
@@ -59,7 +59,7 @@ To reliably run massive 122B parameter MoE models over SSD streaming, `mlx-serve
 ## 🛠️ Quick Start
 
 ### Fastest: Download Pre-built Binary
-The absolute fastest way to get started is to [download the latest pre-compiled macOS binary](https://github.com/SharpAI/mlx-server/releases) directly from the Releases page. Just extract it and run!
+The absolute fastest way to get started is to [download the latest pre-compiled macOS binary](https://github.com/SharpAI/SwiftLM/releases) directly from the Releases page. Just extract it and run!
 
 ### Build from Source
 
@@ -70,7 +70,7 @@ swift build -c release
 ### Run (Downloads model natively on first launch)
 
 ```bash
-.build/release/mlx-server \
+.build/release/SwiftLM \
   --model Qwen3.5-122B-A10B-4bit \
   --stream-experts true \
   --port 5413
@@ -133,7 +133,7 @@ Built entirely on the hard work of the Apple MLX community.
 
 ### 🙏 TurboQuant Credits
 
-The TurboQuant KV cache compression implemented in `mlx-server` is directly based on the following open-source work and research:
+The TurboQuant KV cache compression implemented in `SwiftLM` is directly based on the following open-source work and research:
 
 - **[TheTom/llama-cpp-turboquant](https://github.com/TheTom/llama-cpp-turboquant/tree/feature/turboquant-kv-cache)** — The primary reference for the C and Metal GPU implementation. The `turbo-wht.h` Fast Walsh-Hadamard kernel, WHT sign arrays (seed=42), Lloyd-Max centroid tables, and the `ggml-turbo-quant.c` quantize/dequantize logic were ported directly from this repository into our MLX C++ and Metal backend.
 
diff --git a/Sources/SwiftLM/Calibrator.swift b/Sources/SwiftLM/Calibrator.swift
@@ -5,7 +5,7 @@
 //
 // On first run with a new model, the calibrator runs a short benchmark to find
 // the optimal cache limit that maximizes tok/s. The result is stored in
-// ~/.mlx-server/wisdom/<hash>.json and loaded directly on future runs.
+// ~/.swiftlm/wisdom/<hash>.json and loaded directly on future runs.
 //
 // Usage:
 //   let wisdom = try await Calibrator.calibrate(container: container, plan: plan, profile: profile)
@@ -47,7 +47,7 @@ enum Calibrator {
     /// Directory for wisdom files
     private static var wisdomDirectory: URL {
         let home = FileManager.default.homeDirectoryForCurrentUser
-        return home.appendingPathComponent(".mlx-server/wisdom")
+        return home.appendingPathComponent(".swiftlm/wisdom")
     }
     
     /// Hardware fingerprint: chip + memory + OS
@@ -90,7 +90,7 @@ enum Calibrator {
             decoder.dateDecodingStrategy = .iso8601
             return try decoder.decode(WisdomEntry.self, from: data)
         } catch {
-            print("[mlx-server] ⚠️  Failed to load wisdom: \(error.localizedDescription)")
+            print("[SwiftLM] ⚠️  Failed to load wisdom: \(error.localizedDescription)")
             return nil
         }
     }
@@ -121,7 +121,7 @@ enum Calibrator {
         contextSize: Int = 4096
     ) async throws -> WisdomEntry {
         let startTime = Date()
-        print("[mlx-server] 📊 Calibrating... (this only happens once per model × hardware)")
+        print("[SwiftLM] 📊 Calibrating... (this only happens once per model × hardware)")
         
         // Determine trial cache limits based on available memory
         let systemRAMBytes = Int(ProcessInfo.processInfo.physicalMemory)
@@ -155,7 +155,7 @@ enum Calibrator {
         let maxTokens = 30  // Just enough to measure steady-state decode speed
         
         for (idx, trial) in trials.enumerated() {
-            print("[mlx-server]   Trial \(idx + 1)/\(trials.count): \(trial.label) (\(trial.cacheLimitBytes / (1024*1024))MB)")
+            print("[SwiftLM]   Trial \(idx + 1)/\(trials.count): \(trial.label) (\(trial.cacheLimitBytes / (1024*1024))MB)")
             
             // Set cache limit for this trial
             if trial.cacheLimitBytes > 0 {
@@ -173,13 +173,13 @@ enum Calibrator {
             )
             
             if let result = result {
-                print("[mlx-server]     → \(String(format: "%.1f", result.tokPerSec)) tok/s decode, \(String(format: "%.0f", result.ttftMs))ms TTFT")
+                print("[SwiftLM]     → \(String(format: "%.1f", result.tokPerSec)) tok/s decode, \(String(format: "%.0f", result.ttftMs))ms TTFT")
                 
                 if bestTrial == nil || result.tokPerSec > bestTrial!.tokPerSec {
                     bestTrial = (trial, result.tokPerSec, result.prefillTokPerSec, result.ttftMs)
                 }
             } else {
-                print("[mlx-server]     → failed, skipping")
+                print("[SwiftLM]     → failed, skipping")
             }
         }
         
@@ -209,9 +209,9 @@ enum Calibrator {
         
         try saveWisdom(entry)
         
-        print("[mlx-server] 📊 Calibration complete in \(String(format: "%.1f", elapsed))s")
-        print("[mlx-server]    Winner: \(best.trial.label) → \(String(format: "%.1f", best.tokPerSec)) tok/s")
-        print("[mlx-server]    Saved to ~/.mlx-server/wisdom/")
+        print("[SwiftLM] 📊 Calibration complete in \(String(format: "%.1f", elapsed))s")
+        print("[SwiftLM]    Winner: \(best.trial.label) → \(String(format: "%.1f", best.tokPerSec)) tok/s")
+        print("[SwiftLM]    Saved to ~/.swiftlm/wisdom/")
         
         return entry
     }
diff --git a/Sources/SwiftLM/ModelProfiler.swift b/Sources/SwiftLM/ModelProfiler.swift
@@ -430,7 +430,7 @@ enum ModelProfiler {
         let thinSep = String(repeating: "─", count: 56)
 
         print("╔\(separator)╗")
-        print("║  mlx-server Model Memory Analysis\(String(repeating: " ", count: 25))║")
+        print("║  SwiftLM Model Memory Analysis\(String(repeating: " ", count: 28))║")
         print("╠\(separator)╣")
 
         // Model info
diff --git a/Sources/SwiftLM/Server.swift b/Sources/SwiftLM/Server.swift
diff --git a/docs/api-response-formats.md b/docs/api-response-formats.md
diff --git a/mlx_integration_prompt.md b/mlx_integration_prompt.md