feat(usage): expose first-byte latency in request stats

NGLSL · NGLSL · commit 3a4bfcf5d035 · 2026-04-17T03:12:37.000+08:00
diff --git a/internal/runtime/executor/helps/logging_helpers.go b/internal/runtime/executor/helps/logging_helpers.go
@@ -101,10 +101,13 @@ func RecordAPIRequest(ctx context.Context, cfg *config.Config, info UpstreamRequ
 
 // RecordAPIResponseMetadata captures upstream response status/header information for the latest attempt.
 func RecordAPIResponseMetadata(ctx context.Context, cfg *config.Config, status int, headers http.Header) {
+	ginCtx := ginContextFrom(ctx)
+	if ginCtx != nil && !isStreamingResponseHeaders(headers) {
+		markAPIResponseTimestamp(ginCtx)
+	}
 	if cfg == nil || !cfg.RequestLog {
 		return
 	}
-	ginCtx := ginContextFrom(ctx)
 	if ginCtx == nil {
 		return
 	}
@@ -125,7 +128,16 @@ func RecordAPIResponseMetadata(ctx context.Context, cfg *config.Config, status i
 	updateAggregatedResponse(ginCtx, attempts)
 }
 
+// isStreamingResponseHeaders reports whether headers carry a Content-Type that
+// contains "text/event-stream" (compared case-insensitively). Nil or empty headers report false.
+func isStreamingResponseHeaders(headers http.Header) bool {
+	if len(headers) == 0 {
+		return false
+	}
+	return strings.Contains(strings.ToLower(strings.TrimSpace(headers.Get("Content-Type"))), "text/event-stream")
+}
+
 // RecordAPIResponseError adds an error entry for the latest attempt when no HTTP response is available.
 func RecordAPIResponseError(ctx context.Context, cfg *config.Config, err error) {
 	if cfg == nil || !cfg.RequestLog || err == nil {
 		return
@@ -152,14 +161,17 @@ func RecordAPIResponseError(ctx context.Context, cfg *config.Config, err error)
 
 // AppendAPIResponseChunk appends an upstream response chunk to Gin context for request logging.
 func AppendAPIResponseChunk(ctx context.Context, cfg *config.Config, chunk []byte) {
-	if cfg == nil || !cfg.RequestLog {
-		return
-	}
 	data := bytes.TrimSpace(chunk)
 	if len(data) == 0 {
 		return
 	}
 	ginCtx := ginContextFrom(ctx)
+	if ginCtx != nil {
+		markAPIResponseTimestamp(ginCtx)
+	}
+	if cfg == nil || !cfg.RequestLog {
+		return
+	}
 	if ginCtx == nil {
 		return
 	}
@@ -283,18 +295,20 @@ func WebsocketUpgradeRequestURL(rawURL string) string {
 
 // AppendAPIWebsocketResponse stores an upstream websocket response frame in Gin context.
 func AppendAPIWebsocketResponse(ctx context.Context, cfg *config.Config, payload []byte) {
-	if cfg == nil || !cfg.RequestLog {
-		return
-	}
 	data := bytes.TrimSpace(payload)
 	if len(data) == 0 {
 		return
 	}
 	ginCtx := ginContextFrom(ctx)
+	if ginCtx != nil {
+		markAPIResponseTimestamp(ginCtx)
+	}
+	if cfg == nil || !cfg.RequestLog {
+		return
+	}
 	if ginCtx == nil {
 		return
 	}
-	markAPIResponseTimestamp(ginCtx)
 
 	builder := &strings.Builder{}
 	builder.WriteString(fmt.Sprintf("Timestamp: %s\n", time.Now().Format(time.RFC3339Nano)))
@@ -307,14 +321,16 @@ func AppendAPIWebsocketResponse(ctx context.Context, cfg *config.Config, payload
 
 // RecordAPIWebsocketError stores an upstream websocket error event in Gin context.
 func RecordAPIWebsocketError(ctx context.Context, cfg *config.Config, stage string, err error) {
+	ginCtx := ginContextFrom(ctx)
+	if ginCtx != nil && err != nil { // a nil err call stays a no-op and must not stamp a response time
+		markAPIResponseTimestamp(ginCtx)
+	}
 	if cfg == nil || !cfg.RequestLog || err == nil {
 		return
 	}
-	ginCtx := ginContextFrom(ctx)
 	if ginCtx == nil {
 		return
 	}
-	markAPIResponseTimestamp(ginCtx)
 
 	builder := &strings.Builder{}
 	builder.WriteString(fmt.Sprintf("Timestamp: %s\n", time.Now().Format(time.RFC3339Nano)))
@@ -328,6 +344,9 @@ func RecordAPIWebsocketError(ctx context.Context, cfg *config.Config, stage stri
 }
 
 func ginContextFrom(ctx context.Context) *gin.Context {
+	if ctx == nil { // ctx.Value below would panic on a nil Context interface
+		return nil
+	}
 	ginCtx, _ := ctx.Value("gin").(*gin.Context)
 	return ginCtx
 }
diff --git a/internal/runtime/executor/helps/usage_helpers.go b/internal/runtime/executor/helps/usage_helpers.go
@@ -70,7 +70,7 @@ func (r *UsageReporter) publishWithOutcome(ctx context.Context, detail usage.Det
 		}
 	}
 	r.once.Do(func() {
-		usage.PublishRecord(ctx, r.buildRecord(detail, failed))
+		usage.PublishRecord(ctx, r.buildRecord(ctx, detail, failed))
 	})
 }
 
@@ -83,25 +83,26 @@ func (r *UsageReporter) EnsurePublished(ctx context.Context) {
 		return
 	}
 	r.once.Do(func() {
-		usage.PublishRecord(ctx, r.buildRecord(usage.Detail{}, false))
+		usage.PublishRecord(ctx, r.buildRecord(ctx, usage.Detail{}, false))
 	})
 }
 
-func (r *UsageReporter) buildRecord(detail usage.Detail, failed bool) usage.Record {
+func (r *UsageReporter) buildRecord(ctx context.Context, detail usage.Detail, failed bool) usage.Record {
 	if r == nil {
 		return usage.Record{Detail: detail, Failed: failed}
 	}
 	return usage.Record{
-		Provider:    r.provider,
-		Model:       r.model,
-		Source:      r.source,
-		APIKey:      r.apiKey,
-		AuthID:      r.authID,
-		AuthIndex:   r.authIndex,
-		RequestedAt: r.requestedAt,
-		Latency:     r.latency(),
-		Failed:      failed,
-		Detail:      detail,
+		Provider:         r.provider,
+		Model:            r.model,
+		Source:           r.source,
+		APIKey:           r.apiKey,
+		AuthID:           r.authID,
+		AuthIndex:        r.authIndex,
+		RequestedAt:      r.requestedAt,
+		Latency:          r.latency(),
+		FirstByteLatency: r.firstByteLatency(ctx), // 0 when ctx yields no usable response timestamp
+		Failed:           failed,
+		Detail:           detail,
 	}
 }
 
@@ -116,6 +117,43 @@ func (r *UsageReporter) latency() time.Duration {
 	return latency
 }
 
+// firstByteLatency returns the time between requestedAt and the API response
+// timestamp stored on the Gin context reachable from ctx. It returns 0 when the
+// reporter is nil, requestedAt is unset, no timestamp is available, or the
+// timestamp does not fall after requestedAt.
+func (r *UsageReporter) firstByteLatency(ctx context.Context) time.Duration {
+	if r == nil || r.requestedAt.IsZero() {
+		return 0
+	}
+	responseTimestamp := apiResponseTimestampFromContext(ctx)
+	if responseTimestamp.IsZero() {
+		return 0
+	}
+	latency := responseTimestamp.Sub(r.requestedAt)
+	if latency <= 0 {
+		return 0
+	}
+	return latency
+}
+
+// apiResponseTimestampFromContext reads the "API_RESPONSE_TIMESTAMP" value from
+// the Gin context carried by ctx. It returns the zero time when ctx holds no Gin
+// context, the key is absent, or the stored value is not a time.Time.
+func apiResponseTimestampFromContext(ctx context.Context) time.Time {
+	// Reuse the package's Gin lookup, which already tolerates a nil ctx.
+	ginCtx := ginContextFrom(ctx)
+	if ginCtx == nil {
+		return time.Time{}
+	}
+	value, exists := ginCtx.Get("API_RESPONSE_TIMESTAMP")
+	if !exists {
+		return time.Time{}
+	}
+	// A failed assertion leaves timestamp as the zero time.
+	timestamp, _ := value.(time.Time)
+	return timestamp
+}
+
 func APIKeyFromContext(ctx context.Context) string {
 	if ctx == nil {
 		return ""
diff --git a/internal/runtime/executor/helps/usage_helpers_test.go b/internal/runtime/executor/helps/usage_helpers_test.go
@@ -1,9 +1,12 @@
 package helps
 
 import (
+	"context"
+	"net/http/httptest"
 	"testing"
 	"time"
 
+	"github.com/gin-gonic/gin"
 	"github.com/router-for-me/CLIProxyAPI/v6/sdk/cliproxy/usage"
 )
 
@@ -54,11 +57,36 @@ func TestUsageReporterBuildRecordIncludesLatency(t *testing.T) {
 		requestedAt: time.Now().Add(-1500 * time.Millisecond),
 	}
 
-	record := reporter.buildRecord(usage.Detail{TotalTokens: 3}, false)
+	record := reporter.buildRecord(context.Background(), usage.Detail{TotalTokens: 3}, false)
 	if record.Latency < time.Second {
 		t.Fatalf("latency = %v, want >= 1s", record.Latency)
 	}
 	if record.Latency > 3*time.Second {
 		t.Fatalf("latency = %v, want <= 3s", record.Latency)
 	}
+	if record.FirstByteLatency != 0 {
+		t.Fatalf("first byte latency = %v, want 0", record.FirstByteLatency)
+	}
+}
+
+func TestUsageReporterBuildRecordIncludesFirstByteLatency(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+
+	recorder := httptest.NewRecorder()
+	ginCtx, _ := gin.CreateTestContext(recorder)
+	requestedAt := time.Now().Add(-1500 * time.Millisecond)
+	firstByteAt := requestedAt.Add(250 * time.Millisecond)
+	ginCtx.Set("API_RESPONSE_TIMESTAMP", firstByteAt) // key read by apiResponseTimestampFromContext
+
+	ctx := context.WithValue(context.Background(), "gin", ginCtx) // "gin" is the key the lookup reads
+	reporter := &UsageReporter{
+		provider:    "openai",
+		model:       "gpt-5.4",
+		requestedAt: requestedAt,
+	}
+
+	record := reporter.buildRecord(ctx, usage.Detail{TotalTokens: 3}, false)
+	if record.FirstByteLatency != 250*time.Millisecond { // firstByteAt - requestedAt
+		t.Fatalf("first byte latency = %v, want 250ms", record.FirstByteLatency)
+	}
 }
diff --git a/internal/usage/logger_plugin.go b/internal/usage/logger_plugin.go
@@ -89,12 +89,13 @@ type modelStats struct {
 
 // RequestDetail stores the timestamp, latency, and token usage for a single request.
 type RequestDetail struct {
-	Timestamp time.Time  `json:"timestamp"`
-	LatencyMs int64      `json:"latency_ms"`
-	Source    string     `json:"source"`
-	AuthIndex string     `json:"auth_index"`
-	Tokens    TokenStats `json:"tokens"`
-	Failed    bool       `json:"failed"`
+	Timestamp          time.Time  `json:"timestamp"`
+	LatencyMs          int64      `json:"latency_ms"`
+	FirstByteLatencyMs *int64     `json:"first_byte_latency_ms,omitempty"` // nil (omitted from JSON) when no first-byte latency was recorded
+	Source             string     `json:"source"`
+	AuthIndex          string     `json:"auth_index"`
+	Tokens             TokenStats `json:"tokens"`
+	Failed             bool       `json:"failed"`
 }
 
 // TokenStats captures the token usage breakdown for a request.
@@ -198,12 +199,13 @@ func (s *RequestStatistics) Record(ctx context.Context, record coreusage.Record)
 		s.apis[statsKey] = stats
 	}
 	s.updateAPIStats(stats, modelName, RequestDetail{
-		Timestamp: timestamp,
-		LatencyMs: normaliseLatency(record.Latency),
-		Source:    record.Source,
-		AuthIndex: record.AuthIndex,
-		Tokens:    detail,
-		Failed:    failed,
+		Timestamp:          timestamp,
+		LatencyMs:          normaliseLatency(record.Latency),
+		FirstByteLatencyMs: normaliseOptionalLatency(record.FirstByteLatency),
+		Source:             record.Source,
+		AuthIndex:          record.AuthIndex,
+		Tokens:             detail,
+		Failed:             failed,
 	})
 
 	s.requestsByDay[dayKey]++
@@ -475,6 +477,14 @@ func normaliseLatency(latency time.Duration) int64 {
 	return latency.Milliseconds()
 }
 
+func normaliseOptionalLatency(latency time.Duration) *int64 {
+	if latency <= 0 { // non-positive input maps to nil rather than a pointer to 0
+		return nil
+	}
+	value := latency.Milliseconds() // truncates: a positive sub-millisecond latency becomes 0
+	return &value
+}
+
 func formatHour(hour int) string {
 	if hour < 0 {
 		hour = 0
diff --git a/internal/usage/logger_plugin_test.go b/internal/usage/logger_plugin_test.go
@@ -30,6 +30,36 @@ func TestRequestStatisticsRecordIncludesLatency(t *testing.T) {
 	if details[0].LatencyMs != 1500 {
 		t.Fatalf("latency_ms = %d, want 1500", details[0].LatencyMs)
 	}
+	if details[0].FirstByteLatencyMs != nil {
+		t.Fatalf("first_byte_latency_ms = %v, want nil", *details[0].FirstByteLatencyMs)
+	}
+}
+
+func TestRequestStatisticsRecordIncludesFirstByteLatency(t *testing.T) {
+	stats := NewRequestStatistics()
+	stats.Record(context.Background(), coreusage.Record{
+		APIKey:           "test-key",
+		Model:            "gpt-5.4",
+		RequestedAt:      time.Date(2026, 3, 20, 12, 0, 0, 0, time.UTC),
+		FirstByteLatency: 250 * time.Millisecond, // expected to surface as 250 in the detail
+		Detail: coreusage.Detail{
+			InputTokens:  10,
+			OutputTokens: 20,
+			TotalTokens:  30,
+		},
+	})
+
+	snapshot := stats.Snapshot()
+	details := snapshot.APIs["test-key"].Models["gpt-5.4"].Details
+	if len(details) != 1 {
+		t.Fatalf("details len = %d, want 1", len(details))
+	}
+	if details[0].FirstByteLatencyMs == nil { // check nil first so the dereference below is safe
+		t.Fatal("first_byte_latency_ms = nil, want value")
+	}
+	if *details[0].FirstByteLatencyMs != 250 {
+		t.Fatalf("first_byte_latency_ms = %d, want 250", *details[0].FirstByteLatencyMs)
+	}
 }
 
 func TestRequestStatisticsMergeSnapshotDedupIgnoresLatency(t *testing.T) {
diff --git a/sdk/cliproxy/usage/manager.go b/sdk/cliproxy/usage/manager.go
@@ -10,16 +10,17 @@ import (
 
 // Record contains the usage statistics captured for a single provider request.
 type Record struct {
-	Provider    string
-	Model       string
-	APIKey      string
-	AuthID      string
-	AuthIndex   string
-	Source      string
-	RequestedAt time.Time
-	Latency     time.Duration
-	Failed      bool
-	Detail      Detail
+	Provider         string
+	Model            string
+	APIKey           string
+	AuthID           string
+	AuthIndex        string
+	Source           string
+	RequestedAt      time.Time
+	Latency          time.Duration
+	FirstByteLatency time.Duration // presumably RequestedAt to first upstream response; zero when not measured — confirm with producers
+	Failed           bool
+	Detail           Detail
 }
 
 // Detail holds the token usage breakdown.