From aaacee416076a1ae25691a35b1f56be0fd4bf7cf Mon Sep 17 00:00:00 2001 From: Piotr Stankiewicz Date: Thu, 5 Jun 2025 13:56:16 +0200 Subject: [PATCH 1/3] Return error in case of runner crash In case the runner crashes it would be nice to return an error to the user. So, add an error handler on the proxy and use it to try to figure out if the runner crashed and format the response error accordingly. Signed-off-by: Piotr Stankiewicz --- pkg/inference/scheduling/api.go | 10 ++++++++++ pkg/inference/scheduling/runner.go | 14 ++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/pkg/inference/scheduling/api.go b/pkg/inference/scheduling/api.go index ebd40571d..52c8ec56d 100644 --- a/pkg/inference/scheduling/api.go +++ b/pkg/inference/scheduling/api.go @@ -44,6 +44,16 @@ type OpenAIInferenceRequest struct { Model string `json:"model"` } +// OpenAIErrorResponse is used to format an OpenAI API compatible error response +// (see https://platform.openai.com/docs/api-reference/responses-streaming/error) +type OpenAIErrorResponse struct { + Type string `json:"type"` // always "error" + Code *string `json:"code"` + Message string `json:"message"` + Param *string `json:"param"` + SequenceNumber int `json:"sequence_number"` +} + // BackendStatus represents information about a running backend type BackendStatus struct { // BackendName is the name of the backend diff --git a/pkg/inference/scheduling/runner.go b/pkg/inference/scheduling/runner.go index 43f28e48e..7d06b6d4d 100644 --- a/pkg/inference/scheduling/runner.go +++ b/pkg/inference/scheduling/runner.go @@ -2,6 +2,7 @@ package scheduling import ( "context" + "encoding/json" "errors" "fmt" "io" @@ -143,11 +144,24 @@ func run( w.WriteHeader(http.StatusInternalServerError) select { case <-r.done: + res := OpenAIErrorResponse{ + Type: "error", + Code: nil, + Message: r.err.Error(), + Param: nil, + SequenceNumber: 1, + } + errJson, err := json.Marshal(&res) + if err == nil { + w.Header().Set("Content-Type", "application/json; charset=utf-8") + w.Write(errJson) + } return case <-time.After(30 * time.Second): } } else { w.WriteHeader(http.StatusBadGateway) + } } From 234d5aa387e474fb5293336fc1b28802f6b72385 Mon Sep 17 00:00:00 2001 From: Piotr Stankiewicz Date: Tue, 17 Jun 2025 16:14:58 +0200 Subject: [PATCH 2/3] Fix stall in case a runner crashes while not in active use Signed-off-by: Piotr Stankiewicz --- pkg/inference/scheduling/loader.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/inference/scheduling/loader.go b/pkg/inference/scheduling/loader.go index 536bd80ff..48c3895de 100644 --- a/pkg/inference/scheduling/loader.go +++ b/pkg/inference/scheduling/loader.go @@ -408,7 +408,11 @@ func (l *loader) load(ctx context.Context, backendName, model string, mode infer select { case <-l.slots[existing].done: l.log.Warnf("%s runner for %s is defunct. Waiting for it to be evicted.", backendName, model) - goto WaitForChange + if l.references[existing] == 0 { + l.evictRunner(backendName, model, mode) + } else { + goto WaitForChange + } default: l.references[existing] += 1 l.timestamps[existing] = time.Time{} From 5984f8a9a90b3746ba6c20e6265e1af8dfd8bf6e Mon Sep 17 00:00:00 2001 From: Piotr Date: Tue, 17 Jun 2025 16:52:35 +0200 Subject: [PATCH 3/3] Update pkg/inference/scheduling/runner.go Co-authored-by: Dorin-Andrei Geman --- pkg/inference/scheduling/runner.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/inference/scheduling/runner.go b/pkg/inference/scheduling/runner.go index 7d06b6d4d..15de5fec4 100644 --- a/pkg/inference/scheduling/runner.go +++ b/pkg/inference/scheduling/runner.go @@ -161,7 +161,6 @@ func run( } } else { w.WriteHeader(http.StatusBadGateway) - } }