Commit 9df7e71

Merge pull request #880 from docker/fix/vllm-metal-chat-template
fix: pass chat template to vllm-metal backend
2 parents f2c5744 + 7cf87c8 commit 9df7e71

3 files changed

Lines changed: 202 additions & 4 deletions

pkg/inference/backends/vllm/vllm_config.go

Lines changed: 7 additions & 0 deletions
@@ -40,6 +40,13 @@ func (c *Config) GetArgs(bundle types.ModelBundle, socket string, mode inference
 	// Add socket arguments
 	args = append(args, "--uds", socket)
 
+	// Add chat template if available in the model bundle.
+	// Since transformers v4.44, vLLM no longer provides a default chat
+	// template so we must supply one when the tokenizer omits it.
+	if path := bundle.ChatTemplatePath(); path != "" {
+		args = append(args, "--chat-template", path)
+	}
+
 	// Add mode-specific arguments
 	switch mode {
 	case inference.BackendModeCompletion:
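
The `bundle` argument here is a `types.ModelBundle` from `pkg/distribution/types`. Its full definition is not part of this diff; the sketch below is inferred from the mock in the test file and lists only the methods relevant to this change, so treat it as an illustration rather than the actual declaration.

// Inferred subset of types.ModelBundle (based on mockModelBundle below);
// the real interface lives in pkg/distribution/types and may declare more methods.
type ModelBundle interface {
	GGUFPath() string         // path to a GGUF model file, if the bundle has one
	SafetensorsPath() string  // path to the safetensors weights vLLM serves
	ChatTemplatePath() string // path to a Jinja chat template, or "" when absent
	MMPROJPath() string       // path to a multimodal projector file, if any
}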

pkg/inference/backends/vllm/vllm_config_test.go

Lines changed: 186 additions & 3 deletions
@@ -8,8 +8,9 @@ import (
 )
 
 type mockModelBundle struct {
-	safetensorsPath string
-	runtimeConfig   *types.Config
+	safetensorsPath  string
+	chatTemplatePath string
+	runtimeConfig    *types.Config
 }
 
 func (m *mockModelBundle) GGUFPath() string {
@@ -21,7 +22,7 @@ func (m *mockModelBundle) SafetensorsPath() string {
 }
 
 func (m *mockModelBundle) ChatTemplatePath() string {
-	return ""
+	return m.chatTemplatePath
 }
 
 func (m *mockModelBundle) MMPROJPath() string {
@@ -74,6 +75,36 @@ func TestGetArgs(t *testing.T) {
 				"/tmp/socket",
 			},
 		},
+		{
+			name: "with chat template",
+			bundle: &mockModelBundle{
+				safetensorsPath:  "/path/to/model",
+				chatTemplatePath: "/path/to/bundle/template.jinja",
+			},
+			config: nil,
+			expected: []string{
+				"serve",
+				"/path/to",
+				"--uds",
+				"/tmp/socket",
+				"--chat-template",
+				"/path/to/bundle/template.jinja",
+			},
+		},
+		{
+			name: "without chat template omits flag",
+			bundle: &mockModelBundle{
+				safetensorsPath:  "/path/to/model",
+				chatTemplatePath: "",
+			},
+			config: nil,
+			expected: []string{
+				"serve",
+				"/path/to",
+				"--uds",
+				"/tmp/socket",
+			},
+		},
 		{
 			name: "with backend context size",
 			bundle: &mockModelBundle{
@@ -499,6 +530,158 @@ func TestGetMaxModelLen(t *testing.T) {
 	}
 }
 
+func TestBuildArgs(t *testing.T) {
+	tests := []struct {
+		name        string
+		bundle      *mockModelBundle
+		socket      string
+		model       string
+		modelRef    string
+		mode        inference.BackendMode
+		config      *inference.BackendConfiguration
+		expected    []string
+		expectError bool
+	}{
+		{
+			name: "basic completion mode",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/models/bundle/model/safetensors",
+			},
+			socket: "127.0.0.1:30000",
+			model: "sha256:abc123",
+			modelRef: "ai/test-model:latest",
+			mode: inference.BackendModeCompletion,
+			expected: []string{
+				"-m", "vllm.entrypoints.openai.api_server",
+				"--model", "/models/bundle/model",
+				"--host", "127.0.0.1",
+				"--port", "30000",
+				"--enable-auto-tool-choice", "--tool-call-parser", "hermes",
+				"--served-model-name", "sha256:abc123", "ai/test-model:latest",
+			},
+		},
+		{
+			name: "with chat template",
+			bundle: &mockModelBundle{
+				safetensorsPath:  "/models/bundle/model/safetensors",
+				chatTemplatePath: "/models/bundle/template.jinja",
+			},
+			socket: "127.0.0.1:30000",
+			model: "sha256:abc123",
+			modelRef: "ai/test-model:latest",
+			mode: inference.BackendModeCompletion,
+			expected: []string{
+				"-m", "vllm.entrypoints.openai.api_server",
+				"--model", "/models/bundle/model",
+				"--host", "127.0.0.1",
+				"--port", "30000",
+				"--enable-auto-tool-choice", "--tool-call-parser", "hermes",
+				"--chat-template", "/models/bundle/template.jinja",
+				"--served-model-name", "sha256:abc123", "ai/test-model:latest",
+			},
+		},
+		{
+			name: "without chat template",
+			bundle: &mockModelBundle{
+				safetensorsPath:  "/models/bundle/model/safetensors",
+				chatTemplatePath: "",
+			},
+			socket: "127.0.0.1:30000",
+			model: "sha256:abc123",
+			modelRef: "ai/test-model:latest",
+			mode: inference.BackendModeCompletion,
+			expected: []string{
+				"-m", "vllm.entrypoints.openai.api_server",
+				"--model", "/models/bundle/model",
+				"--host", "127.0.0.1",
+				"--port", "30000",
+				"--enable-auto-tool-choice", "--tool-call-parser", "hermes",
+				"--served-model-name", "sha256:abc123", "ai/test-model:latest",
+			},
+		},
+		{
+			name: "empty safetensors path should error",
+			bundle: &mockModelBundle{
+				safetensorsPath: "",
+			},
+			socket: "127.0.0.1:30000",
+			model: "sha256:abc123",
+			modelRef: "ai/test-model:latest",
+			mode: inference.BackendModeCompletion,
+			expectError: true,
+		},
+		{
+			name: "embedding mode",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/models/bundle/model/safetensors",
+			},
+			socket: "127.0.0.1:30000",
+			model: "sha256:abc123",
+			modelRef: "ai/test-model:latest",
+			mode: inference.BackendModeEmbedding,
+			expected: []string{
+				"-m", "vllm.entrypoints.openai.api_server",
+				"--model", "/models/bundle/model",
+				"--host", "127.0.0.1",
+				"--port", "30000",
+				"--enable-auto-tool-choice", "--tool-call-parser", "hermes",
+				"--runner", "pooling",
+				"--served-model-name", "sha256:abc123", "ai/test-model:latest",
+			},
+		},
+		{
+			name: "with context size",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/models/bundle/model/safetensors",
+			},
+			socket: "127.0.0.1:30000",
+			model: "sha256:abc123",
+			modelRef: "ai/test-model:latest",
+			mode: inference.BackendModeCompletion,
+			config: &inference.BackendConfiguration{
+				ContextSize: int32ptr(4096),
+			},
+			expected: []string{
+				"-m", "vllm.entrypoints.openai.api_server",
+				"--model", "/models/bundle/model",
+				"--host", "127.0.0.1",
+				"--port", "30000",
+				"--enable-auto-tool-choice", "--tool-call-parser", "hermes",
+				"--served-model-name", "sha256:abc123", "ai/test-model:latest",
+				"--max-model-len", "4096",
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			v := &vllmMetal{}
+			args, err := v.buildArgs(tt.bundle, tt.socket, tt.model, tt.modelRef, tt.mode, tt.config)
+
+			if tt.expectError {
+				if err == nil {
+					t.Fatalf("expected error but got none")
+				}
+				return
+			}
+
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+
+			if len(args) != len(tt.expected) {
+				t.Fatalf("expected %d args, got %d\nexpected: %v\ngot: %v", len(tt.expected), len(args), tt.expected, args)
+			}
+
+			for i, arg := range args {
+				if arg != tt.expected[i] {
+					t.Errorf("arg[%d]: expected %q, got %q", i, tt.expected[i], arg)
+				}
+			}
+		})
+	}
+}
+
 func int32ptr(n int32) *int32 {
 	return &n
 }

pkg/inference/backends/vllm/vllm_metal.go

Lines changed: 9 additions & 1 deletion
@@ -13,6 +13,7 @@ import (
 	"strconv"
 	"strings"
 
+	"github.com/docker/model-runner/pkg/distribution/types"
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/backends"
 	"github.com/docker/model-runner/pkg/inference/models"
@@ -252,7 +253,7 @@ func (v *vllmMetal) Run(ctx context.Context, socket, model string, modelRef stri
 // buildArgs builds the command line arguments for vllm-metal server.
 // vllm-metal is a vLLM platform plugin, so we launch vLLM's OpenAI-compatible
 // API server directly; the Metal plugin is auto-discovered via entry points.
-func (v *vllmMetal) buildArgs(bundle interface{ SafetensorsPath() string }, socket, model, modelRef string, mode inference.BackendMode, config *inference.BackendConfiguration) ([]string, error) {
+func (v *vllmMetal) buildArgs(bundle types.ModelBundle, socket, model, modelRef string, mode inference.BackendMode, config *inference.BackendConfiguration) ([]string, error) {
 	// Parse host:port from socket (vllm-metal uses TCP)
 	host, port, err := net.SplitHostPort(socket)
 	if err != nil {
@@ -274,6 +275,13 @@ func (v *vllmMetal) buildArgs(bundle interface{ SafetensorsPath() string }, sock
 		"--enable-auto-tool-choice", "--tool-call-parser", "hermes",
 	}
 
+	// Add chat template if available in the model bundle.
+	// Since transformers v4.44, vLLM no longer provides a default chat
+	// template so we must supply one when the tokenizer omits it.
+	if path := bundle.ChatTemplatePath(); path != "" {
+		args = append(args, "--chat-template", path)
+	}
+
 	// Add mode-specific arguments
 	switch mode {
 	case inference.BackendModeCompletion:
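
For context, the file that `--chat-template` points at is a Jinja template describing how a list of chat messages is rendered into a single prompt. This commit never shows the template's contents, so the constant below is only a hypothetical sketch of the general shape such a file can take, embedded as a Go raw string for illustration.

// Hypothetical example only; not the template shipped with any real model bundle.
// A chat template turns OpenAI-style messages (role + content) into one prompt string.
const exampleChatTemplate = `{% for message in messages %}
<|{{ message.role }}|>{{ message.content }}
{% endfor %}<|assistant|>`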
