Skip to content

Commit 763195e

Browse files
committed
switch to using host ollama so models run on host GPU
1 parent f4f37b6 commit 763195e

3 files changed

Lines changed: 156 additions & 71 deletions

File tree

Makefile

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,19 @@
1-
.PHONY: up down logs logs-watcher pull reindex reindex-scan reindex-files reindex-status debug-retrieve debug-retrieve-dated parse-dates ask ask-stream chat shell check ps restart machine-start machine-init test-install test
1+
# ── model configuration ───────────────────────────────────────────────────────
2+
# These are the Ollama model names used by both the running stack and the
3+
# bootstrap target. Override on the command line to switch models without
4+
# editing any file:
5+
#
6+
# make ollama-bootstrap GENERATOR_MODEL=llama3.2:latest
7+
# make up GENERATOR_MODEL=llama3.2:latest
8+
#
9+
# The values are exported so podman compose inherits them as environment
10+
# variables, and docker-compose.yml references them as ${GENERATOR_MODEL} /
11+
# ${EMBED_MODEL} (with the same defaults as fallback for direct compose runs).
12+
GENERATOR_MODEL ?= gemma4-26b-q4xl:latest
13+
EMBED_MODEL ?= nomic-embed-text
14+
export GENERATOR_MODEL EMBED_MODEL
15+
16+
.PHONY: up down logs logs-watcher ollama-bootstrap ollama-status reindex reindex-scan reindex-files reindex-status debug-retrieve debug-retrieve-dated parse-dates ask ask-stream chat shell check ps restart machine-start machine-init test-install test
217

318
up:
419
podman compose -f docker-compose.yml up -d --build
@@ -12,16 +27,33 @@ logs:
1227
logs-watcher:
1328
podman logs -f markdown-rag-watcher
1429

15-
pull:
16-
# Ensure services are up so env vars are available
17-
podman compose -f docker-compose.yml up -d ollama rag
18-
# Use the rag container's env (GENERATOR_MODEL, EMBED_MODEL, OLLAMA_BASE_URL)
19-
podman exec -it markdown-rag bash -lc ' \
20-
echo "Pulling $$GENERATOR_MODEL via $$OLLAMA_BASE_URL"; \
21-
curl -s -X POST "$${OLLAMA_BASE_URL}/api/pull" -d "{\"name\":\"$${GENERATOR_MODEL}\"}" >/dev/null || true; \
22-
echo "Pulling $$EMBED_MODEL via $$OLLAMA_BASE_URL"; \
23-
curl -s -X POST "$${OLLAMA_BASE_URL}/api/pull" -d "{\"name\":\"$${EMBED_MODEL}\"}" >/dev/null || true \
24-
'
30+
ollama-bootstrap:
31+
# Verify host Ollama is reachable before attempting model pulls.
32+
@curl -sf http://localhost:11434/api/version >/dev/null || \
33+
{ echo "ERROR: Ollama not reachable at localhost:11434."; \
34+
echo " Start it with: ollama serve"; exit 1; }
35+
# Pull models via the Ollama CLI on the host. 'ollama pull' is idempotent:
36+
# it checks the local digest against the registry and skips the download if
37+
# the model is already current, so this target is safe to re-run at any time.
38+
@echo "Pulling generator model: $(GENERATOR_MODEL)"
39+
ollama pull $(GENERATOR_MODEL)
40+
@echo "Pulling embed model: $(EMBED_MODEL)"
41+
ollama pull $(EMBED_MODEL)
42+
@echo "Bootstrap complete. Run 'make ollama-status' to verify."
43+
44+
ollama-status:
45+
# Show host Ollama version and list all pulled models, then print the model
46+
# names required by this stack so the two can be compared by eye.
47+
@curl -sf http://localhost:11434/api/version \
48+
| python3 -c "import sys,json; print('Ollama', json.load(sys.stdin).get('version','?'))" \
49+
|| { echo "ERROR: Ollama not reachable at localhost:11434"; exit 1; }
50+
@echo ""
51+
@echo "Pulled models:"
52+
@ollama list
53+
@echo ""
54+
@echo "Required by this stack (Makefile defaults, override with make var):"
55+
@echo " GENERATOR_MODEL = $(GENERATOR_MODEL)"
56+
@echo " EMBED_MODEL = $(EMBED_MODEL)"
2557

2658
reindex:
2759
curl -s -X POST http://localhost:8000/reindex | jq .

README.md

Lines changed: 104 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,51 @@
33
A containerised RAG stack for your Markdown vault:
44
- Indexes Markdown with **Markdown-header splitting** first, then **sentence-aware fallback**, and finally **char-based** fallback.
55
- Persists embeddings in **Chroma**.
6-
- Uses **Ollama** for both generator (**Granite 4.0 Tiny-H**) and embedder (**nomic-embed-text**).
6+
- Uses **Ollama** for both generator (**Gemma 4 26B Q4**) and embedder (**nomic-embed-text**).
7+
- Ollama runs **on the host** (Metal GPU on macOS) for faster inference and embedding; the containers talk to it via `host.containers.internal`.
78
- **Watchdog** sidecar auto-reindexes on vault changes (debounced).
89

910
## Quick start
10-
1. Edit `.env` and set `HOST_VAULT_PATH` to your Markdown vault absolute path.
11-
2. `make up`
12-
3. `make pull` (first run to cache models)
13-
4. Bring up the API (if it's not running) and start chatting:
11+
12+
1. Install and start [Ollama](https://ollama.com) on your host machine (it must be running before the stack starts).
13+
2. Pull the required models:
14+
```bash
15+
make ollama-bootstrap
16+
```
17+
3. Edit `.env` and set `HOST_VAULT_PATH` to your Markdown vault absolute path.
18+
4. Start the stack:
19+
```bash
20+
make up
21+
```
22+
5. Start chatting:
23+
```bash
24+
./chat.sh
25+
```
26+
27+
## Changing models
28+
29+
Model names are defined as variables at the top of the `Makefile`:
30+
31+
```makefile
32+
GENERATOR_MODEL ?= gemma4-26b-q4xl:latest
33+
EMBED_MODEL ?= nomic-embed-text
34+
```
35+
36+
To switch models, override them on the command line — no file edits required:
37+
1438
```bash
15-
./chat.sh
39+
# Pull the new models first (this also checks that host Ollama is reachable)
40+
make ollama-bootstrap GENERATOR_MODEL=llama3.2:latest
41+
42+
# Then start the stack with the same override
43+
make up GENERATOR_MODEL=llama3.2:latest
1644
```
1745

46+
The values are exported from Make and picked up by `docker-compose.yml` as environment variables. If you want a permanent change, edit the two lines in `Makefile` directly.
47+
48+
> **Note:** changing `EMBED_MODEL` requires a full reindex (`make reindex`) because
49+
> the new embedding model will produce incompatible vectors.
50+
1851
## Manual calls
1952
- Reindex: `make reindex` (also happens on startup, and on changes via watcher)
2053
- Query:
@@ -28,7 +61,7 @@ curl -X POST http://localhost:8000/query -H "Content-Type: application/json" \
2861
- **indexer** (`app/indexer.py`): Loads markdown, splits into chunks, extracts metadata, embeds and upserts to Chroma.
2962
- **name/date parsing**: `app/name_parser.py`, `app/date_parser.py` detect people terms and date ranges.
3063
- **watcher** (`app/watcher.py`): Monitors the vault and triggers partial reindex.
31-
- **models**: Served by local Ollama. See `Makefile: pull` target.
64+
- **models**: Served by Ollama on the host machine.
3265

3366
Data flow (high-level):
3467
1. Markdown file changes → watcher posts changed paths → indexer extracts metadata and chunks → Chroma upsert.
@@ -55,15 +88,16 @@ markdown-rag/
5588
## Configuration
5689
- **.env** (used by docker-compose):
5790
- `HOST_VAULT_PATH`: absolute path to your markdown vault on the host.
58-
- `OLLAMA_BASE_URL`: override to use host Ollama (see “Use host Ollama”).
91+
- **Makefile variables** (source of truth for model names):
92+
- `GENERATOR_MODEL`: LLM used for answering (default `gemma4-26b-q4xl:latest`).
93+
- `EMBED_MODEL`: embedding model (default `nomic-embed-text`).
5994
- **Settings** (`app/settings.py`):
6095
- `index_path`: Chroma persistence directory.
6196
- `vault_path`: container path for mounted vault.
62-
- `embed_model`: embedder name (e.g., `nomic-embed-text`).
63-
- `generator_model`: LLM for answering (e.g., `ibm/granite4:tiny-h`).
6497
- `timezone`: used for date parsing and display.
6598

6699
- **Container env (docker-compose.yml)**:
100+
- `OLLAMA_BASE_URL`: points to `http://host.containers.internal:11434` so containers reach host Ollama.
67101
- `REINDEX_ON_START`: when `true`, `app/run.sh` calls `POST /reindex/scan` after the API boots to enqueue only changed/removed files since the last index state.
68102
- `WATCH_PATH`, `WATCH_DEBOUNCE_SECS`: tune watcher behavior.
69103
- `RAG_URL`, `RAG_FILES_URL` (watcher): endpoints for full and partial reindex (defaults are fine in docker-compose).
@@ -83,26 +117,68 @@ markdown-rag/
83117

84118
## Indexing & retrieval behavior
85119
- **Chunking**: header → sentence → char fallbacks to produce readable chunks.
86-
- **Metadata stored**: `title`, `source`, `entry_date` (when detected), `people` (derived from title, filename, headings, and parent folders). Vector store metadata is sanitized to primitives.
87-
- **Embeddings include metadata**: Each chunk text is prefixed with `[title] [people] [source] [date]` to strengthen person and title relevance.
120+
- **Metadata stored**: `title`, `source`, `entry_date` (from date headings, frontmatter `date` field, or file mtime — in that priority order), `tags` (from frontmatter), `entities` (derived from title, filename, headings, and parent folders). Vector store metadata is sanitized to primitives.
121+
- **Embeddings include metadata**: Each chunk text is prefixed with `[title] [entities] [source] [date] [tags]` to strengthen relevance in vector search.
88122
- **Dates**:
89-
- Query rules like today”, “last 2 weeks, or explicit ranges parsed by `date_parser.py`.
123+
- Query rules like "today", "last 2 weeks", or explicit ranges parsed by `date_parser.py`.
90124
- Retrieval filters strictly by date when a concrete window is parsed; otherwise a name-only fallback is used to avoid empty results.
91125
- **People**:
92126
- Names are extracted from queries (quotes/multi-word preferred; common non-name tokens filtered out).
93-
- Retrieval requires all detected names to match `metadata.people` (or title/source) when any names are found.
127+
- Retrieval requires all detected names to match `metadata.entities` (or title/source) when any names are found.
94128

95129
## Make targets
96-
- `make up` / `make down` / `make logs` / `make logs-watcher`
97-
- `make pull` → pull Ollama models into cache
98-
- `make reindex` → full incremental reindex across all files
99-
- `make reindex-scan` → changed-only scan then partial reindex (same as startup path)
100-
- `make reindex-files` → partial reindex for specific vault-relative paths
101-
- `make debug-retrieve` / `make debug-retrieve-dated` → inspect retrieval
102-
- `make parse-dates` → inspect date parsing
103-
- `make ask` / `make ask-stream` → quick interactive ask / streaming
104-
- `make test-install` → create `.venv` and install test dependencies
105-
- `make test` → run the unit test suite with coverage report
130+
131+
### Ollama (host)
132+
133+
| Target | Description |
134+
|--------|-------------|
135+
| `make ollama-bootstrap` | Pull `GENERATOR_MODEL` and `EMBED_MODEL` to the host Ollama. Safe to re-run — `ollama pull` skips models that are already current. Run this before first `make up` and whenever you change model names. |
136+
| `make ollama-status` | Show host Ollama version and list all pulled models alongside the model names required by the stack. |
137+
138+
### Stack
139+
140+
| Target | Description |
141+
|--------|-------------|
142+
| `make up` | Build images and start `rag` + `watcher` services. |
143+
| `make down` | Stop and remove containers. |
144+
| `make logs` | Tail `rag` container logs. |
145+
| `make logs-watcher` | Tail `watcher` container logs. |
146+
| `make ps` | Show container status. |
147+
| `make restart` | Restart all services. |
148+
| `make shell` | Open a bash shell inside the `rag` container. |
149+
150+
### Indexing
151+
152+
| Target | Description |
153+
|--------|-------------|
154+
| `make reindex` | Full incremental reindex across all vault files. |
155+
| `make reindex-scan` | Changed-only scan then partial reindex (same path as startup). |
156+
| `make reindex-files` | Partial reindex for specific vault-relative paths (prompts for input). |
157+
| `make reindex-status` | Show last reindex result. |
158+
159+
### Querying / debugging
160+
161+
| Target | Description |
162+
|--------|-------------|
163+
| `make ask` | Interactive single question (blocking). |
164+
| `make ask-stream` | Interactive single question (streaming). |
165+
| `make debug-retrieve` | Vector search only, no metadata in response. |
166+
| `make debug-retrieve-dated` | Vector search with full metadata (date, entities, etc.). |
167+
| `make parse-dates` | Test date parsing on a query. |
168+
169+
### Podman machine (macOS)
170+
171+
| Target | Description |
172+
|--------|-------------|
173+
| `make machine-init` | Create Podman VM (4 CPU, 8 GB RAM, 50 GB disk). |
174+
| `make machine-start` | Start an existing Podman VM. |
175+
176+
### Tests
177+
178+
| Target | Description |
179+
|--------|-------------|
180+
| `make test-install` | One-time setup: create `.venv` and install test dependencies. |
181+
| `make test` | Run the full test suite with coverage report. |
106182

107183
## Testing
108184

@@ -132,21 +208,16 @@ make test # run all tests with coverage
132208
| `name_parser.py` | 83% |
133209

134210
## Troubleshooting
135-
- **No results for sentence queries with a name**: ensure your notes have the person name in title, filename, headings, or a parent folder (so it gets into `people`). Run `make reindex`.
211+
- **No results for sentence queries with a name**: ensure your notes have the person name in title, filename, headings, or a parent folder (so it gets into `entities`). Run `make reindex`.
136212
- **List-valued metadata error**: we sanitize metadata to primitives; if you changed metadata shapes, re-run `make reindex`.
137-
- **Persist errors with Chroma**: new `langchain-chroma` handles persistence automatically; explicit `persist()` isn’t required.
138-
- **Using host Ollama**: set `OLLAMA_BASE_URL=http://host.containers.internal:11434` in `docker-compose.yml` and remove bundled `ollama` service if desired.
213+
- **Ollama not reachable**: ensure `ollama serve` is running on the host before `make up`. Verify with `make ollama-status`.
214+
- **Wrong model loaded**: the stack reads `GENERATOR_MODEL` / `EMBED_MODEL` at container start. If you changed them, run `make down && make up GENERATOR_MODEL=<new>`.
215+
- **Embedding model changed**: requires a full reindex — vectors from different embedding models are incompatible. Run `make reindex` after switching `EMBED_MODEL`.
139216

140217
## Watcher behavior
141218
- The watcher debounces file events and calls `POST /reindex/files` with exact changed paths.
142219
- If partial reindex fails, it falls back to `POST /reindex` (full) to self-heal.
143220

144-
145-
## Use host Ollama
146-
- Change `OLLAMA_BASE_URL` env for `rag` service to `http://host.containers.internal:11434`.
147-
- Optionally remove the `ollama` service.
148-
149221
## Notes
150222
- The loader **ignores** `.obsidian/` and expands `[[wikilinks]]` to their alias or target text.
151223
- Citations include front-matter fields when present (e.g., `title`, `tags`).
152-

docker-compose.yml

Lines changed: 9 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,20 @@
11
services:
2-
ollama:
3-
image: ollama/ollama:latest
4-
container_name: ollama
5-
restart: unless-stopped
6-
ports:
7-
- "11434:11434"
8-
volumes:
9-
- ollama_models:/root/.ollama
10-
# More reliable healthcheck under Podman
11-
healthcheck:
12-
test: ["CMD", "ollama", "list"]
13-
interval: 10s
14-
timeout: 5s
15-
start_period: 25s
16-
retries: 30
17-
environment:
18-
GIN_MODE: release
19-
202
rag:
213
build:
224
context: ./app
235
container_name: markdown-rag
24-
depends_on:
25-
ollama:
26-
condition: service_healthy
276
environment:
287
TIMEZONE: Europe/London
298
TZ: Europe/London
30-
OLLAMA_BASE_URL: http://ollama:11434
31-
# To use host Ollama instead, override with:
32-
# OLLAMA_BASE_URL: http://host.containers.internal:11434
33-
GENERATOR_MODEL: ibm/granite4:tiny-h
34-
EMBED_MODEL: nomic-embed-text
9+
# Ollama runs on the host (Metal GPU on macOS) rather than in a container.
10+
# host.containers.internal resolves to the host from inside Podman containers.
11+
OLLAMA_BASE_URL: http://host.containers.internal:11434
12+
# Model names default to the Makefile variables (GENERATOR_MODEL / EMBED_MODEL).
13+
# When started via 'make up' those are exported into the environment and
14+
# picked up here. The :- fallbacks ensure 'podman compose up' works directly
15+
# without going through make.
16+
GENERATOR_MODEL: ${GENERATOR_MODEL:-gemma4-26b-q4xl:latest}
17+
EMBED_MODEL: ${EMBED_MODEL:-nomic-embed-text}
3518
VAULT_PATH: /vault
3619
INDEX_PATH: /index/chroma
3720
SYSTEM_PROMPT_FILE: /app/system_prompt.txt
@@ -69,5 +52,4 @@ services:
6952
command: ["python", "watcher.py"]
7053

7154
volumes:
72-
ollama_models:
7355
chroma_index:

0 commit comments

Comments
 (0)