# docker-compose.training.yml
#
# Sovereign Forge opt-in overlay — fine-tuning pipeline + Ray compute mesh.
#
# Adds:
#   training-api    — REST API for datasets, jobs, and model registry (:9001)
#   training-worker — GPU worker: Ray head node, DeploymentSupervisor actor,
#                     dynamic vLLM container spawning via Docker SDK
#
# USAGE:
#   pdavid --mode up --training
#   pdavid --mode up --training --vllm   # + static vLLM inference server
#   pdavid --mode up --gpu --training    # full sovereign stack
#
# Or manually:
#   docker compose -f docker-compose.yml -f docker-compose.training.yml up -d
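#
#   To tear the overlay down (standard Compose, same file list):
#     docker compose -f docker-compose.yml -f docker-compose.training.yml down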
#
# REQUIREMENTS:
#   - NVIDIA GPU with drivers installed
#   - NVIDIA Container Toolkit
#     https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html
#   - /var/run/docker.sock accessible (worker spawns vLLM containers via Docker SDK)
#   - HF_TOKEN set in .env for gated model downloads
#       pdavid configure --set HF_TOKEN=hf_abc123
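#
#   Preflight sketch — verifies GPU passthrough and the Docker socket
#   (the plain-ubuntu trick is from NVIDIA's toolkit docs; nvidia-smi is
#   injected by the runtime):
#     docker run --rm --gpus all ubuntu nvidia-smi
#     test -S /var/run/docker.sock && echo "docker.sock reachable"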
#
# SCALE-OUT:
#   To join a second GPU machine to the Ray cluster, set on the remote machine:
#     RAY_ADDRESS=ray://<head_ip>:10001
#   Then: docker compose -f docker-compose.yml -f docker-compose.training.yml up -d training-worker
#   No code changes required — Ray discovers the node automatically.
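#
#   Illustrative join (10.0.0.5 stands in for the head node's address),
#   then verify from the head container — this assumes the ray CLI ships
#   in the worker image:
#     RAY_ADDRESS=ray://10.0.0.5:10001 \
#       docker compose -f docker-compose.yml -f docker-compose.training.yml up -d training-worker
#     docker exec training_worker ray status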
#
# IMAGE PINNING
# ─────────────
# Pinned: 2026-04-07 | Core: v1.27.x
services:
  # ---------------------------------------------------------------------------
  # training-api — Fine-tuning REST API (no GPU required)
  # ---------------------------------------------------------------------------
  training-api:
    # v1.27.x — upgrade to next core release on next bump
    image: thanosprime/projectdavid-core-training-api@sha256:e10f351081de1e927214a2a0e68a6e3c539b0480357ac77bf877c0a14177469a
    container_name: training_api
    restart: unless-stopped
    env_file:
      - .env
    environment:
      - DATABASE_URL=${DATABASE_URL}
      - SECRET_KEY=${SECRET_KEY}
      - DEFAULT_SECRET_KEY=${DEFAULT_SECRET_KEY}
      - REDIS_URL=redis://redis:6379/0
      - ASSISTANTS_BASE_URL=http://api:9000
      - WORKER_API_KEY=${ADMIN_API_KEY}
      - SANDBOX_AUTH_SECRET=${SANDBOX_AUTH_SECRET}
      - SHARED_PATH=/mnt/training_data
      - PYTHONUNBUFFERED=1
    # training-api queries Ray via the dashboard HTTP API only.
    # No direct GCS connection — RAY_ADDRESS not needed here.
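    # Illustrative dashboard call (Ray's jobs REST endpoint, per Ray docs;
    # that this is the endpoint the service uses is an assumption, and the
    # dashboard must be bound to a non-localhost host to answer here):
    #   curl http://training-worker:8265/api/jobs/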
    ports:
      - "9001:9001"
    volumes:
      - ${SHARED_PATH:-./shared_data}:/mnt/training_data
    depends_on:
      - redis
      - training-worker
    networks:
      - my_custom_network

  # ---------------------------------------------------------------------------
  # training-worker — GPU worker + Ray head node
  # ---------------------------------------------------------------------------
  training-worker:
    # v1.27.x — upgrade to next core release on next bump
    image: thanosprime/projectdavid-core-training-worker@sha256:e10f351081de1e927214a2a0e68a6e3c539b0480357ac77bf877c0a14177469a
    container_name: training_worker
    restart: unless-stopped
    # The published image has an incorrect default CMD (uvicorn).
    # This override boots the correct Ray head node + DeploymentSupervisor entrypoint.
    # NOTE: do NOT mount ./src:/app/src here — it would shadow the image's /app/src
    # and hide worker.py, causing "No such file or directory" at startup.
    command: ["python", "/app/src/api/training/worker.py"]
    env_file:
      - .env
    runtime: nvidia
    shm_size: '5gb'
    environment:
      - RAY_CLIENT_SERVER_PORT=10001
      - TRAINING_PROFILE=${TRAINING_PROFILE:-standard}
      - DATABASE_URL=${DATABASE_URL}
      - REDIS_URL=redis://redis:6379/0
      - ASSISTANTS_BASE_URL=http://api:9000
      - WORKER_API_KEY=${ADMIN_API_KEY}
      - SHARED_PATH=/mnt/training_data
      - HF_TOKEN=${HF_TOKEN:-}
      - HF_HOME=/root/.cache/huggingface
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - PYTHONUNBUFFERED=1
      # Ray: unset = start as head node (default).
      # Set to ray://<head_ip>:10001 to join an existing cluster as a worker.
      - RAY_ADDRESS=${RAY_ADDRESS:-}
      - RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265}
    ports:
      - "8265:8265"   # Ray dashboard — http://localhost:8265
      - "10001:10001" # Ray client protocol (external node join)
    volumes:
      # Docker SDK access — required for dynamic vLLM container spawning
      - /var/run/docker.sock:/var/run/docker.sock
      - ${SHARED_PATH:-./shared_data}:/mnt/training_data
      - ${HF_CACHE_PATH:-~/.cache/huggingface}:/root/.cache/huggingface
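      # Optional warm-up sketch: pre-populate the host HF cache so first boot
      # skips large downloads (the repo id below is only an example):
      #   huggingface-cli download meta-llama/Llama-3.1-8B-Instruct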
    depends_on:
      - redis
    networks:
      - my_custom_network
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
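    # To pin specific GPUs instead of reserving them all, the Compose spec's
    # device_ids field replaces count (sketch — the IDs are examples):
    #   devices:
    #     - driver: nvidia
    #       device_ids: ["0", "1"]
    #       capabilities: [gpu]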