switch to using host ollama so models run on host GPU

codeafix · codeafix · commit 763195e01c0a · 2026-04-12T17:08:45.000+01:00
diff --git a/Makefile b/Makefile
@@ -1,4 +1,19 @@
-.PHONY: up down logs logs-watcher pull reindex reindex-scan reindex-files reindex-status debug-retrieve debug-retrieve-dated parse-dates ask ask-stream chat shell check ps restart machine-start machine-init test-install test
+# ── model configuration ───────────────────────────────────────────────────────
+# Ollama model names shared by the running stack and the bootstrap target.
+# `?=` leaves alone any value already given on the command line or in the
+# environment, so models can be switched without editing a file:
+#
+#   make ollama-bootstrap GENERATOR_MODEL=llama3.2:latest
+#   make up              GENERATOR_MODEL=llama3.2:latest
+#
+# Both names are exported so podman compose inherits them; docker-compose.yml
+# reads ${GENERATOR_MODEL} / ${EMBED_MODEL} with the same defaults as fallback.
+GENERATOR_MODEL ?= gemma4-26b-q4xl:latest
+EMBED_MODEL     ?= nomic-embed-text
+export GENERATOR_MODEL
+export EMBED_MODEL
+
+.PHONY: up down logs logs-watcher ollama-bootstrap ollama-status reindex reindex-scan reindex-files reindex-status debug-retrieve debug-retrieve-dated parse-dates ask ask-stream chat shell check ps restart machine-start machine-init test-install test
 
 up:
 	podman compose -f docker-compose.yml up -d --build
@@ -12,16 +27,33 @@ logs:
 logs-watcher:
 	podman logs -f markdown-rag-watcher
 
-pull:
-	# Ensure services are up so env vars are available
-	podman compose -f docker-compose.yml up -d ollama rag
-	# Use the rag container's env (GENERATOR_MODEL, EMBED_MODEL, OLLAMA_BASE_URL)
-	podman exec -it markdown-rag bash -lc ' \
-	  echo "Pulling $$GENERATOR_MODEL via $$OLLAMA_BASE_URL"; \
-	  curl -s -X POST "$${OLLAMA_BASE_URL}/api/pull" -d "{\"name\":\"$${GENERATOR_MODEL}\"}" >/dev/null || true; \
-	  echo "Pulling $$EMBED_MODEL via $$OLLAMA_BASE_URL"; \
-	  curl -s -X POST "$${OLLAMA_BASE_URL}/api/pull" -d "{\"name\":\"$${EMBED_MODEL}\"}" >/dev/null || true \
-	'
+# ollama-bootstrap: pull both configured models with the host Ollama CLI,
+# failing fast when the API at localhost:11434 does not answer.  'ollama pull'
+# is idempotent (already-current models are skipped), so re-running is safe.
+# Comments stay outside the recipe so make does not echo them on every run.
+ollama-bootstrap:
+	@curl -sf http://localhost:11434/api/version >/dev/null || \
+	  { echo "ERROR: Ollama not reachable at localhost:11434." >&2; \
+	    echo "       Start it with: ollama serve" >&2; exit 1; }
+	@echo "Pulling generator model: $(GENERATOR_MODEL)"
+	ollama pull "$(GENERATOR_MODEL)"
+	@echo "Pulling embed model: $(EMBED_MODEL)"
+	ollama pull "$(EMBED_MODEL)"
+	@echo "Bootstrap complete. Run 'make ollama-status' to verify."
+
+# ollama-status: print the host Ollama version, the locally pulled models, and
+# the model names this stack is configured to use, for manual comparison.
+ollama-status:
+	@ver=$$(curl -sf http://localhost:11434/api/version) \
+	  || { echo "ERROR: Ollama not reachable at localhost:11434" >&2; exit 1; }; \
+	  printf '%s' "$$ver" | python3 -c "import sys,json; print('Ollama', json.load(sys.stdin).get('version','?'))"
+	@echo ""
+	@echo "Pulled models:"
+	@ollama list
+	@echo ""
+	@echo "Required by this stack (Makefile defaults, override with make var):"
+	@echo "  GENERATOR_MODEL = $(GENERATOR_MODEL)"
+	@echo "  EMBED_MODEL     = $(EMBED_MODEL)"
 
 reindex:
 	curl -s -X POST http://localhost:8000/reindex | jq .
diff --git a/README.md b/README.md
@@ -3,18 +3,51 @@
 A containerised RAG stack for your Markdown vault:
 - Indexes Markdown with **Markdown-header splitting** first, then **sentence-aware fallback**, and finally **char-based** fallback.
 - Persists embeddings in **Chroma**.
-- Uses **Ollama** for both generator (**Granite 4.0 Tiny-H**) and embedder (**nomic-embed-text**).
+- Uses **Ollama** for both generator (**Gemma 4 26B Q4**) and embedder (**nomic-embed-text**).
+- Ollama runs **on the host** (Metal GPU on macOS) for faster inference and embedding; the containers talk to it via `host.containers.internal`.
 - **Watchdog** sidecar auto-reindexes on vault changes (debounced).
 
 ## Quick start
-1. Edit `.env` and set `HOST_VAULT_PATH` to your Markdown vault absolute path.
-2. `make up`
-3. `make pull` (first run to cache models)
-4. Bring up the API (if it's not running) and start chatting:
+
+1. Install and start [Ollama](https://ollama.com) on your host machine (it must be running before the stack starts).
+2. Pull the required models:
+   ```bash
+   make ollama-bootstrap
+   ```
+3. Edit `.env` and set `HOST_VAULT_PATH` to your Markdown vault absolute path.
+4. Start the stack:
+   ```bash
+   make up
+   ```
+5. Start chatting:
+   ```bash
+   ./chat.sh
+   ```
+
+## Changing models
+
+Model names are defined as variables at the top of the `Makefile`:
+
+```makefile
+GENERATOR_MODEL ?= gemma4-26b-q4xl:latest
+EMBED_MODEL     ?= nomic-embed-text
+```
+
+To switch models, override them on the command line — no file edits required:
+
 ```bash
-./chat.sh
+# Pull the new model first (run `make ollama-status` afterwards to verify)
+make ollama-bootstrap GENERATOR_MODEL=llama3.2:latest
+
+# Then start the stack with the same override
+make up GENERATOR_MODEL=llama3.2:latest
 ```
 
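+The values are exported from Make and picked up by `docker-compose.yml` as environment variables.  For a permanent change, edit the two lines in `Makefile` directly, and update the matching `:-` fallbacks in `docker-compose.yml` so direct `podman compose` runs use the same models.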
+
+> **Note:** changing `EMBED_MODEL` requires a full reindex (`make reindex`) because
+> the new embedding model will produce incompatible vectors.
+
 ## Manual calls
 - Reindex: `make reindex` (also happens on startup, and on changes via watcher)
 - Query:
@@ -28,7 +61,7 @@ curl -X POST http://localhost:8000/query -H "Content-Type: application/json" \
 - **indexer** (`app/indexer.py`): Loads markdown, splits into chunks, extracts metadata, embeds and upserts to Chroma.
 - **name/date parsing**: `app/name_parser.py`, `app/date_parser.py` detect people terms and date ranges.
 - **watcher** (`app/watcher.py`): Monitors the vault and triggers partial reindex.
-- **models**: Served by local Ollama. See `Makefile: pull` target.
+- **models**: Served by Ollama on the host machine.
 
 Data flow (high-level):
 1. Markdown file changes → watcher posts changed paths → indexer extracts metadata and chunks → Chroma upsert.
@@ -55,15 +88,16 @@ markdown-rag/
 ## Configuration
 - **.env** (used by docker-compose):
   - `HOST_VAULT_PATH`: absolute path to your markdown vault on the host.
-  - `OLLAMA_BASE_URL`: override to use host Ollama (see “Use host Ollama”).
+- **Makefile variables** (source of truth for model names):
+  - `GENERATOR_MODEL`: LLM used for answering (default `gemma4-26b-q4xl:latest`).
+  - `EMBED_MODEL`: embedding model (default `nomic-embed-text`).
 - **Settings** (`app/settings.py`):
   - `index_path`: Chroma persistence directory.
   - `vault_path`: container path for mounted vault.
-  - `embed_model`: embedder name (e.g., `nomic-embed-text`).
-  - `generator_model`: LLM for answering (e.g., `ibm/granite4:tiny-h`).
   - `timezone`: used for date parsing and display.
 
 - **Container env (docker-compose.yml)**:
+  - `OLLAMA_BASE_URL`: points to `http://host.containers.internal:11434` so containers reach host Ollama.
   - `REINDEX_ON_START`: when `true`, `app/run.sh` calls `POST /reindex/scan` after the API boots to enqueue only changed/removed files since the last index state.
   - `WATCH_PATH`, `WATCH_DEBOUNCE_SECS`: tune watcher behavior.
   - `RAG_URL`, `RAG_FILES_URL` (watcher): endpoints for full and partial reindex (defaults are fine in docker-compose).
@@ -83,26 +117,68 @@ markdown-rag/
 
 ## Indexing & retrieval behavior
 - **Chunking**: header → sentence → char fallbacks to produce readable chunks.
-- **Metadata stored**: `title`, `source`, `entry_date` (when detected), `people` (derived from title, filename, headings, and parent folders). Vector store metadata is sanitized to primitives.
-- **Embeddings include metadata**: Each chunk text is prefixed with `[title] [people] [source] [date]` to strengthen person and title relevance.
+- **Metadata stored**: `title`, `source`, `entry_date` (from date headings, frontmatter `date` field, or file mtime — in that priority order), `tags` (from frontmatter), `entities` (derived from title, filename, headings, and parent folders). Vector store metadata is sanitized to primitives.
+- **Embeddings include metadata**: Each chunk text is prefixed with `[title] [entities] [source] [date] [tags]` to strengthen relevance in vector search.
 - **Dates**:
-  - Query rules like “today”, “last 2 weeks”, or explicit ranges parsed by `date_parser.py`.
+  - Query rules like "today", "last 2 weeks", or explicit ranges parsed by `date_parser.py`.
   - Retrieval filters strictly by date when a concrete window is parsed; otherwise a name-only fallback is used to avoid empty results.
 - **People**:
   - Names are extracted from queries (quotes/multi-word preferred; common non-name tokens filtered out).
-  - Retrieval requires all detected names to match `metadata.people` (or title/source) when any names are found.
+  - Retrieval requires all detected names to match `metadata.entities` (or title/source) when any names are found.
 
 ## Make targets
-- `make up` / `make down` / `make logs` / `make logs-watcher`
-- `make pull` → pull Ollama models into cache
-- `make reindex` → full incremental reindex across all files
-- `make reindex-scan` → changed-only scan then partial reindex (same as startup path)
-- `make reindex-files` → partial reindex for specific vault-relative paths
-- `make debug-retrieve` / `make debug-retrieve-dated` → inspect retrieval
-- `make parse-dates` → inspect date parsing
-- `make ask` / `make ask-stream` → quick interactive ask / streaming
-- `make test-install` → create `.venv` and install test dependencies
-- `make test` → run the unit test suite with coverage report
+
+### Ollama (host)
+
+| Target | Description |
+|--------|-------------|
+| `make ollama-bootstrap` | Pull `GENERATOR_MODEL` and `EMBED_MODEL` to the host Ollama. Safe to re-run — `ollama pull` skips models that are already current. Run this before first `make up` and whenever you change model names. |
+| `make ollama-status` | Show host Ollama version and list all pulled models alongside the model names required by the stack. |
+
+### Stack
+
+| Target | Description |
+|--------|-------------|
+| `make up` | Build images and start `rag` + `watcher` services. |
+| `make down` | Stop and remove containers. |
+| `make logs` | Tail `rag` container logs. |
+| `make logs-watcher` | Tail `watcher` container logs. |
+| `make ps` | Show container status. |
+| `make restart` | Restart all services. |
+| `make shell` | Open a bash shell inside the `rag` container. |
+
+### Indexing
+
+| Target | Description |
+|--------|-------------|
+| `make reindex` | Full incremental reindex across all vault files. |
+| `make reindex-scan` | Changed-only scan then partial reindex (same path as startup). |
+| `make reindex-files` | Partial reindex for specific vault-relative paths (prompts for input). |
+| `make reindex-status` | Show last reindex result. |
+
+### Querying / debugging
+
+| Target | Description |
+|--------|-------------|
+| `make ask` | Interactive single question (blocking). |
+| `make ask-stream` | Interactive single question (streaming). |
+| `make debug-retrieve` | Vector search only, no metadata in response. |
+| `make debug-retrieve-dated` | Vector search with full metadata (date, entities, etc.). |
+| `make parse-dates` | Test date parsing on a query. |
+
+### Podman machine (macOS)
+
+| Target | Description |
+|--------|-------------|
+| `make machine-init` | Create Podman VM (4 CPU, 8 GB RAM, 50 GB disk). |
+| `make machine-start` | Start an existing Podman VM. |
+
+### Tests
+
+| Target | Description |
+|--------|-------------|
+| `make test-install` | One-time setup: create `.venv` and install test dependencies. |
+| `make test` | Run the full test suite with coverage report. |
 
 ## Testing
 
@@ -132,21 +208,16 @@ make test           # run all tests with coverage
 | `name_parser.py` | 83% |
 
 ## Troubleshooting
-- **No results for sentence queries with a name**: ensure your notes have the person name in title, filename, headings, or a parent folder (so it gets into `people`). Run `make reindex`.
+- **No results for sentence queries with a name**: ensure your notes have the person name in title, filename, headings, or a parent folder (so it gets into `entities`). Run `make reindex`.
 - **List-valued metadata error**: we sanitize metadata to primitives; if you changed metadata shapes, re-run `make reindex`.
-- **Persist errors with Chroma**: new `langchain-chroma` handles persistence automatically; explicit `persist()` isn’t required.
-- **Using host Ollama**: set `OLLAMA_BASE_URL=http://host.containers.internal:11434` in `docker-compose.yml` and remove bundled `ollama` service if desired.
+- **Ollama not reachable**: ensure `ollama serve` is running on the host before `make up`. Verify with `make ollama-status`.
+- **Wrong model loaded**: the stack reads `GENERATOR_MODEL` / `EMBED_MODEL` at container start. If you changed them, run `make down && make up GENERATOR_MODEL=<new>`.
+- **Embedding model changed**: requires a full reindex — vectors from different embedding models are incompatible. Run `make reindex` after switching `EMBED_MODEL`.
 
 ## Watcher behavior
 - The watcher debounces file events and calls `POST /reindex/files` with exact changed paths.
 - If partial reindex fails, it falls back to `POST /reindex` (full) to self-heal.
 
-
-## Use host Ollama
-- Change `OLLAMA_BASE_URL` env for `rag` service to `http://host.containers.internal:11434`.
-- Optionally remove the `ollama` service.
-
 ## Notes
 - The loader **ignores** `.obsidian/` and expands `[[wikilinks]]` to their alias or target text.
 - Citations include front-matter fields when present (e.g., `title`, `tags`).
-
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,37 +1,20 @@
 services:
-  ollama:
-    image: ollama/ollama:latest
-    container_name: ollama
-    restart: unless-stopped
-    ports:
-      - "11434:11434"
-    volumes:
-      - ollama_models:/root/.ollama
-    # More reliable healthcheck under Podman
-    healthcheck:
-      test: ["CMD", "ollama", "list"]
-      interval: 10s
-      timeout: 5s
-      start_period: 25s
-      retries: 30
-    environment:
-      GIN_MODE: release
-
   rag:
     build:
       context: ./app
     container_name: markdown-rag
-    depends_on:
-      ollama:
-        condition: service_healthy
     environment:
       TIMEZONE: Europe/London
       TZ: Europe/London
-      OLLAMA_BASE_URL: http://ollama:11434
-      # To use host Ollama instead, override with:
-      # OLLAMA_BASE_URL: http://host.containers.internal:11434
-      GENERATOR_MODEL: ibm/granite4:tiny-h
-      EMBED_MODEL: nomic-embed-text
+      # Ollama runs on the host (Metal GPU on macOS) rather than in a container.
+      # host.containers.internal resolves to the host from inside Podman containers.
+      OLLAMA_BASE_URL: http://host.containers.internal:11434
+      # Model names default to the Makefile variables (GENERATOR_MODEL / EMBED_MODEL).
+      # When started via 'make up' those are exported into the environment and
+      # picked up here.  The :- fallbacks ensure 'podman compose up' works directly
+      # without going through make.
+      GENERATOR_MODEL: ${GENERATOR_MODEL:-gemma4-26b-q4xl:latest}
+      EMBED_MODEL: ${EMBED_MODEL:-nomic-embed-text}
       VAULT_PATH: /vault
       INDEX_PATH: /index/chroma
       SYSTEM_PROMPT_FILE: /app/system_prompt.txt
@@ -69,5 +52,4 @@ services:
     command: ["python", "watcher.py"]
 
 volumes:
-  ollama_models:
   chroma_index: