Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 78 additions & 4 deletions pkg/handlers/response.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
package handlers

import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"strconv"
"time"
Expand All @@ -44,9 +46,9 @@

if !headers.GetEndOfStream() {
log.FromContext(ctx).V(logutil.VERBOSE).Info("captured response headers, deferring response until body arrives...")
return nil
}
// EndOfStream means no body is expected, return HeadersResponse immediately
// Always respond to response headers so Envoy proceeds with body chunks.
// In STREAMED/FULL_DUPLEX_STREAMED mode, Envoy blocks until we respond.
return []*eppb.ProcessingResponse{
{
Response: &eppb.ProcessingResponse_ResponseHeaders{
Expand All @@ -64,8 +66,15 @@
}

if err := json.Unmarshal(responseBodyBytes, &reqCtx.Response.Body); err != nil {
logger.Error(err, "Failed to parse response body as JSON, skipping response plugins")
return s.generateEmptyResponseBodyResponse(responseBodyBytes), nil
// Try parsing as SSE (Server-Sent Events) — streaming responses from providers
// like Anthropic use SSE format which isn't valid JSON.
if sseBody, sseErr := parseSSEResponseBody(responseBodyBytes); sseErr == nil && sseBody != nil {
reqCtx.Response.Body = sseBody
logger.V(logutil.VERBOSE).Info("parsed SSE response body for response plugins")
} else {
logger.Error(err, "Failed to parse response body as JSON or SSE, skipping response plugins")
return s.generateEmptyResponseBodyResponse(responseBodyBytes), nil
}
}

if err := s.runResponsePlugins(ctx, reqCtx.CycleState, reqCtx.Response); err != nil {
Expand Down Expand Up @@ -130,6 +139,71 @@
}, nil
}

// parseSSEResponseBody extracts a composite response body from an SSE (Server-Sent Events)
// stream. It scans all "data:" lines for JSON objects and merges usage/model fields into
// a single map that response plugins can process. This enables usage-tracking and metering
// plugins to work with streaming responses from providers like Anthropic and OpenAI.
// parseSSEResponseBody extracts a composite response body from an SSE (Server-Sent Events)
// stream. It parses by SSE event boundaries instead of individual lines because one logical
// event may legally contain multiple consecutive `data:` lines that must be joined before JSON decoding.
func parseSSEResponseBody(body []byte) (map[string]any, error) {
result := map[string]any{}
lines := bytes.Split(body, []byte("\n"))

Check failure on line 151 in pkg/handlers/response.go

View workflow job for this annotation

GitHub Actions / lint-and-test

declared and not used: lines

Check failure on line 151 in pkg/handlers/response.go

View workflow job for this annotation

GitHub Actions / lint-and-test

declared and not used: lines
eventDataLines := make([][]byte, 0)

// flushEvent keeps the SSE framing logic local to this parser because the bug happens
// exactly at event boundaries: we must join all `data:` lines for one event before parsing.
flushEvent := func() {

Check failure on line 156 in pkg/handlers/response.go

View workflow job for this annotation

GitHub Actions / lint-and-test

declared and not used: flushEvent) (typecheck)

Check failure on line 156 in pkg/handlers/response.go

View workflow job for this annotation

GitHub Actions / lint-and-test

declared and not used: flushEvent (typecheck)
if len(eventDataLines) == 0 {
return
}

data := bytes.Join(eventDataLines, []byte("\n"))
eventDataLines = eventDataLines[:0]

data = bytes.TrimSpace(data)
if len(data) == 0 || bytes.Equal(data, []byte("[DONE]")) {
return
}

var event map[string]any
if err := json.Unmarshal(data, &event); err != nil {
return
}

if model, ok := event["model"].(string); ok && model != "" {
result["model"] = model
}
Comment thread
davidbreitgand marked this conversation as resolved.

// Check for usage at top level (Anthropic) or nested in response (OpenAI Responses API)
usage, _ := event["usage"].(map[string]any)
if usage == nil {
if resp, ok := event["response"].(map[string]any); ok {
usage, _ = resp["usage"].(map[string]any)
if m, ok := resp["model"].(string); ok && m != "" {
result["model"] = m
}
}
}
if usage != nil {
existing, _ := result["usage"].(map[string]any)
if existing == nil {
existing = map[string]any{}
}
for k, v := range usage {
existing[k] = v
}
result["usage"] = existing
}
}

if len(result) == 0 {
return nil, errors.New("no parseable SSE data events found")
}

return result, nil
}

// runResponsePlugins executes response plugins in the order they were registered.
func (s *Server) runResponsePlugins(ctx context.Context, cycleState *plugin.CycleState, response *requesthandling.InferenceResponse) error {
logger := log.FromContext(ctx).V(logutil.DEFAULT)
Expand Down
10 changes: 10 additions & 0 deletions pkg/handlers/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,16 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
}
responseBody = append(responseBody, v.ResponseBody.Body...)
if !v.ResponseBody.EndOfStream {
// Send an immediate response for this chunk so Envoy continues
// streaming. Without this, Envoy blocks waiting for our response
// and stops forwarding subsequent chunks.
if sendErr := srv.Send(&extProcPb.ProcessingResponse{
Response: &extProcPb.ProcessingResponse_ResponseBody{
ResponseBody: &extProcPb.BodyResponse{},
},
}); sendErr != nil {
return status.Errorf(codes.Unknown, "failed to send streaming response ack: %v", sendErr)
}
continue
}
reqCtx.ResponseCompleteTimestamp = time.Now()
Expand Down
10 changes: 10 additions & 0 deletions pkg/handlers/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,10 @@ func TestHandleResponseBody_Streaming(t *testing.T) {
if err := process.Send(request); err != nil {
t.Fatalf("send response headers: %v", err)
}
// Discard the immediate header ack (HandleResponseHeaders always responds now).
if _, err := process.Recv(); err != nil {
t.Fatalf("recv header ack: %v", err)
}

for _, c := range tc.chunks {
request = &extProcPb.ProcessingRequest{
Expand All @@ -190,6 +194,12 @@ func TestHandleResponseBody_Streaming(t *testing.T) {
if err := process.Send(request); err != nil {
t.Fatalf("send response body chunk: %v", err)
}
// Discard the immediate ack for non-EoS chunks.
if !c.endOfStream {
if _, err := process.Recv(); err != nil {
t.Fatalf("recv chunk ack: %v", err)
}
}
}

got := make([]*extProcPb.ProcessingResponse, 0, len(want))
Expand Down
Loading