make run LLAMA_ARGS="--verbose --jinja -ngl 999 --ctx-size 2048"
make
```
`dmr` starts the server on a free port, waits for it to be ready, runs your CLI command, then shuts the server down:

```bash
./dmr run ai/smollm2 "Hello, how are you?"
./dmr ls
./dmr run qwen3:0.6B-Q4_0 "tell me today's news"
```
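The steps `dmr` automates can be sketched in plain shell. This is a hypothetical illustration of the pattern, not the wrapper's actual implementation; it assumes the `model-runner` and `cmd/cli/model-cli` binaries from the build steps, and a fixed port stands in for the free port `dmr` would pick:

```shell
# Hypothetical sketch of what dmr automates (binary paths assumed from
# the build steps; a fixed port stands in for a dynamically chosen one).
PORT=13434

# 1. Start the server in the background.
MODEL_RUNNER_PORT=$PORT ./model-runner &
SERVER_PID=$!

# 2. Wait until the server answers HTTP requests.
until curl -s "http://localhost:$PORT/" >/dev/null; do
  sleep 0.2
done

# 3. Run the CLI command against it.
MODEL_RUNNER_HOST="http://localhost:$PORT" ./cmd/cli/model-cli run ai/smollm2 "Hello, how are you?"

# 4. Shut the server down.
kill $SERVER_PID
```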

These components can also be built, run, and tested separately using the Makefile.

### Testing the Complete Stack End-to-End

> **Note:** We use port 13434 in these examples to avoid conflicts with Docker Desktop's built-in Model Runner, which typically runs on port 12434.

#### Option 1: Manual two-terminal setup

1. **Start model-runner in one terminal:**
```bash
MODEL_RUNNER_PORT=13434 ./model-runner
```

2. **Use model-cli in another terminal:**
```bash
# List available models
MODEL_RUNNER_HOST=http://localhost:13434 ./cmd/cli/model-cli list

# Pull and run a model
MODEL_RUNNER_HOST=http://localhost:13434 ./cmd/cli/model-cli run ai/smollm2 "Hello, how are you?"
```
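Because model-runner speaks an OpenAI-compatible API, you can also exercise the server directly with curl while it is running. The `/engines/v1/chat/completions` path below follows Docker Model Runner's OpenAI-compatible route, but treat the exact path as an assumption and adjust it to your build:

```shell
# Assumed OpenAI-compatible endpoint; requires the server from step 1 to be
# running on port 13434 with ai/smollm2 already pulled.
curl -s http://localhost:13434/engines/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "ai/smollm2",
        "messages": [
          {"role": "user", "content": "Hello, how are you?"}
        ]
      }'
```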

#### Option 2: Using Docker
If you are interested in a specific Kubernetes use-case, please start a
discussion on the issue tracker.

## dmrlet: Container Orchestrator for AI Inference

dmrlet is a purpose-built container orchestrator for AI inference workloads. Unlike Kubernetes, it focuses exclusively on running stateless inference containers with zero configuration overhead. Multi-GPU mapping "just works" without YAML, device plugins, or node selectors.