
Commit 1c18ddf

feat(config): user-tunable context window with log-scale slider (#120)
* feat(warmup): pre-load active model into VRAM on activation
* feat(warmup): trigger model pre-warm on first keystroke
* fix(warmup): fire on all show paths, not only activator double-tap
* feat(keep-warm): add keep_warm config fields and Ollama keep_alive integration
* feat(keep-warm): add Keep Warm settings section with toggle, timeout, and eject
* feat(config): add keep warm settings and update related functionality
* fix(warmup): prime KV cache via /api/chat with system prompt
* style(config): fix import formatting in loader.rs after rebase
* feat(keep-warm): show loaded model name and VRAM status in settings
* Refactor Keep Warm functionality and improve VRAM management
* feat(config): user-tunable context window with slider UI
* fix(settings): clamp ctx chip upper bound, add ctx+keep-warm tips
* fix(ModelTab): clarify context message about model max clamp
* docs(settings): tighten ctx-window + keep-warm copy for accuracy
* docs: add tuning-context-window guide with benchmark recipe
* docs(tuning): unwrap hard line breaks so prose flows at any width
* feat(tips): make TipBar clickable when tip contains a URL
* fix(tips,settings): open external links via open_url, friendly tip text
* feat(ModelTab): update button text for context window tuning tutorial
* feat(tips): update tip text for Context Window tuning tutorial
* fix(config): correct num_ctx upper bound doc and clamp inactivity input

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
1 parent ed9b250 commit 1c18ddf

36 files changed

Lines changed: 3843 additions & 162 deletions

docs/configurations.md

Lines changed: 12 additions & 0 deletions
@@ -31,6 +31,15 @@ open ~/Library/Application\ Support/com.quietnode.thuki/config.toml
# selected from the in-app picker (which lists whatever is installed in
# Ollama via /api/tags) and is stored in Thuki's local database, not here.
ollama_url = "http://127.0.0.1:11434"
+# Minutes of inactivity before Thuki tells Ollama to release the model.
+# 0 = let Ollama manage (its own 5-minute default applies).
+# -1 = never release (keep loaded until Ollama itself exits or you unload manually).
+keep_warm_inactivity_minutes = 0
+# Context window size in tokens sent to Ollama with every request.
+# Warmup and chat share this value so Ollama reuses the same runner and its
+# cached KV prefix for the system prompt. Raise to fit longer conversations;
+# lower to reduce GPU memory use. Valid range: 2048–1048576.
+num_ctx = 16384

[prompt]
# Leave empty to use the built-in secretary persona.
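For orientation, the two keys added in this hunk line up with fields in Ollama's public chat API: `num_ctx` travels inside `options`, and the keep-warm timeout corresponds to `keep_alive` (a value of `0` presumably just leaves `keep_alive` unset so Ollama's 5-minute default applies). The sketch below exercises the Ollama API directly; the model name and the 30-minute value are placeholders, and the exact request Thuki builds lives in its Rust source, not here.

```bash
# Sketch only: how num_ctx and a keep-warm timeout map onto Ollama's public
# /api/chat fields. Model name and keep_alive value are placeholders.
curl -s http://127.0.0.1:11434/api/chat -d '{
  "model": "gemma4:e2b",
  "messages": [{"role": "user", "content": "ping"}],
  "options": { "num_ctx": 16384 },
  "keep_alive": "30m",
  "stream": false
}' | jq -r '.message.content'
```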
@@ -88,6 +97,8 @@ When no model is installed and no choice has been persisted, Thuki refuses to di
| Constant | Default | Tunable? | Why not tunable | Bounds | Description |
| :--- | :--- | :--- | :--- | :--- | :--- |
| `ollama_url` | `"http://127.0.0.1:11434"` | Yes | | non-empty URL | The web address where Thuki finds your local Ollama server. The default works if you run Ollama on this machine with its standard port. Change this only if you moved Ollama to a different port or another machine. |
| `keep_warm_inactivity_minutes` | `0` | Yes | | `-1` or `[0, 1440]` | Minutes of inactivity before Thuki tells Ollama to release the model from VRAM. `0` means do not manage: Ollama's own 5-minute default applies. `-1` means never release (the model stays loaded until Ollama exits or you unload manually). Raise it if you leave longer gaps between uses; lower it to reclaim VRAM sooner. |
| `num_ctx` | `16384` | Yes | | `[2048, 1048576]` | Context window size in tokens sent to Ollama with every request. Warmup and chat share this value so Ollama reuses the same runner instance and its cached KV prefix for the system prompt; if they differed, Ollama would spin up a second runner and the warmup would save nothing. Ollama silently clamps this to the model's physical maximum, so values above the model's capacity are accepted but have no extra effect. Raise it to fit longer conversations without the model forgetting early messages; each doubling roughly doubles the VRAM used by the KV cache (see the sketch after this table). Lower it to reclaim GPU memory at the cost of a shorter effective history. 16384 is the default because it comfortably holds the full system prompt (~4000 tokens) plus many turns while staying within 8 GB GPU budgets. See [Tuning the Context Window](./tuning-context-window.md) for a 5-minute benchmark recipe to find the right value for your hardware. |

If the active model has been removed from Ollama between launches, Thuki silently falls back to the first installed model the next time you open the picker. If no models are installed at all, the next request surfaces a "Model not found" error with the exact `ollama pull <name>` command to run.
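The "each doubling roughly doubles the VRAM used by the KV cache" claim in the `num_ctx` row follows from how the cache is sized: one key vector and one value vector per layer, per token. A back-of-envelope sketch with made-up model dimensions (the layer count, KV heads, head size, and 2-byte f16 elements are placeholders, not any specific model's values); the `SIZE` column of `ollama ps` remains the ground truth:

```bash
# Rough KV-cache estimate: 2 (K and V) * layers * kv_heads * head_dim
# * bytes_per_element * num_ctx. Dimensions below are placeholders.
layers=32 kv_heads=8 head_dim=128 bytes_per_elem=2
for num_ctx in 16384 32768 65536; do
  bytes=$((2 * layers * kv_heads * head_dim * bytes_per_elem * num_ctx))
  echo "num_ctx=$num_ctx  ->  ~$((bytes / 1024 / 1024)) MiB of KV cache"
done
```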

@@ -100,6 +111,7 @@ The table below also lists the baked-in safety limits that govern Thuki's commun
| `MAX_OLLAMA_TAGS_BODY_BYTES` | `4 MiB` | No | Defense-in-depth bound on attacker-controlled response body. A misbehaving or compromised Ollama could otherwise stream an unbounded payload and exhaust memory. | | The largest `/api/tags` response body Thuki will accept. 4 MiB fits thousands of model entries; anything larger is rejected immediately and the request returns an error. |
| `MAX_OLLAMA_SHOW_BODY_BYTES` | `4 MiB` | No | Defense-in-depth bound on attacker-controlled response body. Same rationale as `MAX_OLLAMA_TAGS_BODY_BYTES`. | | The largest `/api/show` response body Thuki will accept. Full Modelfiles and parameters can be sizable, but 4 MiB is well above any real model; larger responses are rejected. |
| `MAX_MODEL_SLUG_LEN` | `256 B` | No | Defense-in-depth bound on adversarial input. Real Ollama slugs are a handful of characters; capping the length stops malformed values long before any network or DB work. | | The longest model slug Thuki will accept from `set_active_model`. Anything longer is rejected immediately by `validate_model_slug`. |
| `VRAM_POLL_INTERVAL_SECS` | `5 s` | No | Tuning this trades responsiveness against localhost polling load; 5 s is the sweet spot for loopback calls and matches Ollama's internal TTL resolution granularity. | | How often Thuki polls Ollama's `/api/ps` to detect VRAM changes made outside Thuki (for example, running `ollama stop` or a TTL expiry). The Settings panel VRAM indicator reflects these changes within one interval (a do-it-yourself polling sketch follows this table). |
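To watch the same signal the Settings panel watches, you can poll `/api/ps` yourself on the same cadence. A rough stand-in, assuming Ollama on its default localhost port and `jq` installed:

```bash
# Rough stand-in for the Settings panel VRAM indicator: poll /api/ps every 5 s
# and print what is currently loaded.
while true; do
  curl -s http://127.0.0.1:11434/api/ps |
    jq -r '.models[] | "\(.name)  \(.size_vram) bytes in GPU memory  expires \(.expires_at)"'
  sleep 5
done
```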

### `[prompt]`
docs/tuning-context-window.md

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
# Tuning the Context Window for Your Mac

The Context Window slider in Settings goes up to 1 M tokens, but the value that's actually _good_ for your machine depends on your GPU memory and the model you picked. This guide explains what those numbers mean and walks you through finding your sweet spot in about 5 minutes.

> macOS only. Thuki is a Mac app and the steps below assume Apple Silicon (M1/M2/M3/M4/M5).
## Quick vocabulary

A few terms you'll see in this doc and in tools like `ollama ps`:

- **Model weights**: the trained "knowledge" of the model. Fixed size; does not change with your settings. Loaded into memory once.
- **Token**: a chunk of text, roughly ¾ of a word. "Context window in tokens" means the model can see that many word-chunks at once.
- **Context window (`num_ctx`)**: how many tokens the model can see in a single conversation. Bigger window means more conversation history visible to the model.
- **KV cache**: scratch space the model uses to remember the conversation while generating. Grows with the context window. **Doubling `num_ctx` roughly doubles the KV cache.** Model weights stay the same size.
- **GPU**: the chip that runs the math. On Apple Silicon Macs, the GPU is built into the same chip as the CPU.
- **VRAM / "GPU memory"**: the memory the GPU can read directly. On Apple Silicon this is _unified memory_, shared with the CPU; there is no separate VRAM chip. So when we say "Ollama is using 7 GiB of VRAM", we mean it is holding 7 GiB of your unified memory and the GPU has direct access to it.
- **Cold load**: the few seconds it takes to read the model from disk into memory the first time you use it.
- **Keep Warm**: tells Ollama to leave the model in memory after a reply, so the next message skips the cold load.
## What Ollama does behind the scenes

- When you send your first message, Ollama reads your selected model from disk into unified memory. This is the cold load.
- It also allocates the KV cache based on `num_ctx`. A bigger context means a bigger allocation.
- After the reply, Ollama keeps the model in memory for **5 minutes by default** (the `keep_alive` setting), then unloads it. The next request after that pays the cold load again.
- If you set a `num_ctx` larger than the model can actually handle, Ollama silently caps it. Example: you set 1 M, the model maxes out at 128 K, so Ollama uses 128 K. No error, just clamped down. (You can verify this yourself; see the sketch after this list.)
- If the requested memory exceeds what's available on the GPU, Ollama puts part of the model on the CPU instead. **This is the slow path** and is what we want to avoid.
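You can watch both the silent clamp and the keep-alive countdown from the terminal. The sketch below hits Ollama's generate endpoint directly with an oversized context; the model name is a placeholder, so substitute whatever you have installed:

```bash
# Request a deliberately huge context, then see what Ollama actually used.
# CONTEXT shows the clamped value; UNTIL shows the keep_alive countdown.
curl -s http://127.0.0.1:11434/api/generate -d '{
  "model": "gemma4:e2b",
  "prompt": "hello",
  "options": { "num_ctx": 1048576 },
  "stream": false
}' > /dev/null
ollama ps
```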
## The three signals to watch

These are the only signals you need to decide whether your settings are healthy:

1. **Is the model 100% on GPU?** Most important. CPU spill makes inference 5-20× slower.
2. **Is system Memory Pressure green?** Leaves headroom for macOS and your other apps.
3. **Does the GPU actually fire when you generate?** Sanity check that the model is doing real work.
## The 5-minute benchmark recipe

### Step 1 — Pick a starting value

Open Thuki **Settings → Context Window**. Set the slider to **16384** (16K). This is the default and works on most Macs.
### Step 2 — Open Activity Monitor's Memory tab

1. Press `Cmd + Space`, type **Activity Monitor**, press Enter.
2. Click the **Memory** tab at the top of the window.
3. Look at the bottom of the window for the **Memory Pressure** graph (the colored graph in the lower-left). Green is good.
4. Leave this window visible. (If you'd rather watch this from the terminal, see the sketch after this step.)

<!-- screenshot: Activity Monitor → Memory tab with the Memory Pressure graph circled -->
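If you prefer a terminal readout over the graph, recent macOS releases ship a `memory_pressure` utility whose summary line reports the same signal numerically. Its exact output format varies by macOS version, so treat this as a sketch:

```bash
# Terminal alternative to the Memory Pressure graph. Output format varies by
# macOS release; the final line summarizes system-wide free memory.
memory_pressure | tail -n 1
```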
### Step 3 — Open the GPU History window

GPU History is a separate floating window inside Activity Monitor. To open it (a terminal alternative appears at the end of this step):

1. With Activity Monitor focused (click anywhere inside its window first), look at the macOS menu bar at the very top of your screen.
2. Click **Window** in the menu bar (between "View" and "Help").
3. From the dropdown, click **GPU History** (keyboard shortcut: `Cmd + 4`).
4. A small floating window appears showing live GPU activity bars. Drag it next to Activity Monitor.

> If you don't see a "Window" menu in the menu bar, click anywhere in the Activity Monitor window first to focus it, then look at the menu bar again.
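A command-line alternative to GPU History is `powermetrics`, which ships with macOS and can sample GPU activity on Apple Silicon. It needs `sudo`, and flag spellings can differ between releases, so treat this as a sketch:

```bash
# Sample GPU activity five times, once per second. Look for the GPU active
# residency climbing while a reply is streaming.
sudo powermetrics --samplers gpu_power -i 1000 -n 5
```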
### Step 4 — Open Terminal

1. Press `Cmd + Space`, type **Terminal** (or use your favorite terminal emulator), press Enter.
2. Place it next to the other two windows.
### Step 5 — Send a test message

Open Thuki and send your usual kind of question, or paste a long block of text and ask about it. While the reply streams, watch:

- The **GPU History** bars should spike high.
- **Memory Pressure** should stay green.
### Step 6 — Check what Ollama actually did

While the reply is on screen (or right after), run in Terminal:

```bash
ollama ps
```

You'll see something like:

```
NAME          ID              SIZE      PROCESSOR    CONTEXT    UNTIL
gemma4:e2b    7fbdbf8f5e45    7.4 GB    100% GPU     16384      4 minutes from now
```

What to read:

- `PROCESSOR` must read **`100% GPU`**. If it shows `47%/53% CPU/GPU` (or any split), the model spilled out of unified memory. Too much context for your hardware. (The API sketch after this step computes the same check as a percentage.)
- `SIZE` is the total footprint right now (model weights + KV cache).
- `CONTEXT` shows the actual context length Ollama used, after any silent clamping to your model's trained max.
- `UNTIL` shows when Keep Warm will release the model.

Note the `SIZE` value. You'll compare it against the next try.
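If you'd rather read this check from the API than from the `PROCESSOR` column, the share of the loaded model that fits in GPU-addressable memory is `size_vram / size`. A one-liner sketch, assuming `jq` is installed:

```bash
# Percent of the loaded model resident in GPU-addressable memory.
# 100 means fully on GPU; anything lower is the CPU-spill slow path.
curl -s http://127.0.0.1:11434/api/ps |
  jq -r '.models[] | "\(.name): \((100 * .size_vram / .size | floor))% on GPU"'
```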
### Step 7 — Bump the context and repeat

Go back to Thuki Settings and double the value (16K → 32K → 64K → ...). Send another test message. Re-run `ollama ps`.

Stop the moment **any** of these happens:

- `PROCESSOR` drops below 100% GPU, **or**
- Memory Pressure turns yellow or red, **or**
- Replies feel sluggish.
### Step 8 — Lock in your sweet spot

Set Thuki to one tier _below_ your last working value for safety margin. Example: 64K worked but `SIZE` was tight against your unified memory total → use 32K. You're done.
## Picking Keep Warm

Keep Warm is the second knob in the same Settings section. It tells Ollama how long to leave the model in memory between messages.

- **`0`** — let Ollama use its 5-minute default. Good baseline.
- **5 to 30 minutes** — good if you use Thuki in bursts every few minutes.
- **`-1`** — always loaded. Only choose this if you have memory headroom and want zero cold-start ever.
- **Unload now** — manual eject when you're done for the day (roughly what running `ollama stop` yourself does; see the sketch after this list).
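Each of these choices corresponds to a plain Ollama operation, which is handy when you want to double-check what a setting did. A sketch of the manual equivalents (the model name comes from the example output above; substitute your own):

```bash
# Manual equivalents of the Keep Warm controls, run against Ollama directly.
ollama ps                 # is anything loaded right now, and until when?
ollama stop gemma4:e2b    # release the model immediately ("Unload now")
```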
## Common results explained

- **"GPU is 0% when Thuki is idle."** Normal. Keep Warm holds the model in memory, but the GPU only fires during generation. Memory residency and active compute are different things.
- **"CPU stays low even during a reply."** Normal. Metal runs the math on the GPU; the CPU only orchestrates.
- **"I set 1 M but `ollama ps` shows 128 K."** Normal. Ollama caps at the model's trained max and silently clamps down.
- **"Model unloads on its own."** Either your Keep Warm timer expired or something else (you, or another tool) ran `ollama stop`.
- **"Inference suddenly got slow."** Check `ollama ps` for a `CPU/GPU` split. You've spilled out of unified memory. Lower `num_ctx` or pick a smaller model.
## Going deeper

If you want raw machine-readable numbers, the same data plus a few extra fields is available from the Ollama HTTP API:

```bash
curl -s http://127.0.0.1:11434/api/ps | jq
```

Useful extra fields not shown by `ollama ps` (a filter that pulls just these follows below):

- `size_vram` — bytes the GPU is actually addressing (vs `size`, which includes any CPU portion when the model spilled).
- `expires_at` — exact ISO timestamp when Keep Warm will release the model.
- `digest` — content hash of the loaded model file.
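For example (assuming `jq` is installed):

```bash
# Extract only the fields called out above from /api/ps.
curl -s http://127.0.0.1:11434/api/ps |
  jq '.models[] | {name, size, size_vram, expires_at, digest}'
```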

src-tauri/Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.

src-tauri/Cargo.toml

Lines changed: 2 additions & 0 deletions
@@ -45,6 +45,8 @@ async-trait = "0.1"
tauri-nspanel = { git = "https://github.com/ahkohd/tauri-nspanel", branch = "v2.1" }
core-graphics = "0.25"
core-foundation = "0.10"
+objc2 = "0.6"
+objc2-app-kit = { version = "0.3", features = ["NSApplication", "NSRunningApplication"] }

[dev-dependencies]
mockito = "1"
