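# docker-compose.example.yml
#
# Example stack: an Ollama instance pinned to one NVIDIA GPU, a CPU-only
# Ollama instance, a proxy built from the local Dockerfile that fronts both,
# and an optional Cloudflare tunnel.
# Typical usage (illustrative): copy this file to docker-compose.yml, adjust
# the values, then run `docker compose up -d --build`.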
version: '3.9'

services:
  # Ollama GPU Service
  ollama-gpu:
    image: ollama/ollama:0.11.4
    pull_policy: if_not_present
    restart: unless-stopped
    volumes:
      - ollama_gpu_data:/root/.ollama
      - ./ollama/ollama.json:/root/.ollama/ollama.json
      # Mount custom GGUF models directory
      # Place your .gguf files in ./models/gguf/
      # Create Modelfiles in ./models/modelfiles/
      # Then use: docker exec ollama-gpu ollama create <name> -f /models/modelfiles/<file>.Modelfile
      # (an example Modelfile sketch follows this volume list)
      - ./models:/models
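    # A minimal Modelfile sketch for a custom GGUF model; the file name and
    # the parameter value are illustrative placeholders, not part of this stack:
    #   FROM /models/gguf/my-model.gguf
    #   PARAMETER num_ctx 16384
    # Build it with the `ollama create` command shown above.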
    networks:
      - ollama_network
    environment:
      - OLLAMA_DEBUG=1
      - OLLAMA_KEEP_ALIVE=60m
      - OLLAMA_VERBOSE=1
      - OLLAMA_HOST=0.0.0.0:11434
      - OLLAMA_NUM_PARALLEL=1
      - OLLAMA_NUM_CTX=16384
      - OLLAMA_NUM_THREAD=8
      - OLLAMA_MLOCK=false
      - OLLAMA_NUM_BATCH=512 # Higher value for better performance
    shm_size: 4gb # Increase from default 64MB to 4GB
    oom_kill_disable: true
    command: serve
    deploy:
      resources:
        limits:
          memory: 36G
        reservations:
          memory: 16G
          devices:
            - driver: nvidia
              count: 1 # Use only 1 GPU
              capabilities: [gpu]
    ulimits:
      memlock: -1 # Unlimited locked memory

  # Ollama CPU Service
  ollama-cpu:
    image: ollama/ollama:0.11.4
    pull_policy: if_not_present
    restart: unless-stopped
    volumes:
      - ollama_cpu_data:/root/.ollama
      - ./ollama/ollama.json:/root/.ollama/ollama.json
    networks:
      - ollama_network
    environment:
      - OLLAMA_DEBUG=1
      - OLLAMA_KEEP_ALIVE=60m
      - OLLAMA_VERBOSE=1
      - OLLAMA_HOST=0.0.0.0:11434
      - OLLAMA_NUM_PARALLEL=4 # More parallel requests for CPU
      - OLLAMA_NUM_CTX=16384
      - OLLAMA_NUM_THREAD=16 # More threads for CPU processing
      - OLLAMA_MLOCK=false
      - OLLAMA_NUM_BATCH=512
    deploy:
      resources:
        limits:
          memory: 16G
        reservations:
          memory: 8G
    command: serve

  # API proxy in front of the two Ollama services (built from the local Dockerfile)
  proxy:
    build: .
    restart: unless-stopped
    environment:
      - API_KEY=${API_KEY} # Set this in your environment or .env file (example below)
      - OLLAMA_GPU_URL=${OLLAMA_GPU_URL:-http://ollama-gpu:11434}
      - OLLAMA_CPU_URL=${OLLAMA_CPU_URL:-http://ollama-cpu:11434}
    depends_on:
      - ollama-gpu
      - ollama-cpu
    networks:
      - ollama_network
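  # A possible .env file for the proxy's variables; the values shown are
  # placeholders (the two URL entries are optional, since the defaults above
  # already point at the bundled services):
  #   API_KEY=change-me
  #   OLLAMA_GPU_URL=http://ollama-gpu:11434
  #   OLLAMA_CPU_URL=http://ollama-cpu:11434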

  # Cloudflared Tunnel (optional); expects the tunnel configuration and
  # credentials under ./cloudflare (see the sketch below this service)
  cloudflared:
    image: cloudflare/cloudflared:latest
    restart: unless-stopped
    networks:
      - ollama_network
    volumes:
      - ./cloudflare:/etc/cloudflared
    command: tunnel --no-autoupdate run
    depends_on:
      - proxy
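  # A rough sketch of what ./cloudflare/config.yml might contain for a
  # locally managed tunnel; the tunnel ID, hostname, and proxy port are
  # placeholders that must match your own tunnel and the proxy's listen port:
  #   tunnel: <TUNNEL_ID>
  #   credentials-file: /etc/cloudflared/<TUNNEL_ID>.json
  #   ingress:
  #     - hostname: ollama.example.com
  #       service: http://proxy:8080
  #     - service: http_status:404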

volumes:
  ollama_gpu_data:
  ollama_cpu_data:

networks:
  ollama_network: