-
Notifications
You must be signed in to change notification settings - Fork 27
feat(observability): add fine-grained OTel spans for the plugin pipeline and model selector #165
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -24,17 +24,40 @@ import ( | |||||
|
|
||||||
| "github.com/go-logr/logr" | ||||||
| "go.opentelemetry.io/otel" | ||||||
| "go.opentelemetry.io/otel/attribute" | ||||||
| "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" | ||||||
| "go.opentelemetry.io/otel/exporters/stdout/stdouttrace" | ||||||
| "go.opentelemetry.io/otel/propagation" | ||||||
| "go.opentelemetry.io/otel/sdk/resource" | ||||||
| sdktrace "go.opentelemetry.io/otel/sdk/trace" | ||||||
| semconv "go.opentelemetry.io/otel/semconv/v1.37.0" | ||||||
| "go.opentelemetry.io/otel/trace" | ||||||
|
|
||||||
| "github.com/llm-d/llm-d-inference-payload-processor/pkg/common/observability/logging" | ||||||
| "github.com/llm-d/llm-d-inference-payload-processor/version" | ||||||
| ) | ||||||
|
|
||||||
| // instrumentationName is the default OTel instrumentation scope used when no | ||||||
| // explicit scope is supplied to Tracer. | ||||||
| const instrumentationName = "llm-d-inference-payload-processor" | ||||||
|
|
||||||
| // Tracer returns a tracer for the given instrumentation scope, defaulting to | ||||||
| // "llm-d-inference-payload-processor". The build version and commit SHA are | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| // attached so every span in a trace carries consistent scope metadata. | ||||||
| func Tracer(scope ...string) trace.Tracer { | ||||||
| name := instrumentationName | ||||||
| if len(scope) > 0 && scope[0] != "" { | ||||||
| name = scope[0] | ||||||
| } | ||||||
| return otel.Tracer( | ||||||
| name, | ||||||
| trace.WithInstrumentationVersion(version.BuildRef), | ||||||
| trace.WithInstrumentationAttributes( | ||||||
| attribute.String("commit-sha", version.CommitSHA), | ||||||
| ), | ||||||
| ) | ||||||
| } | ||||||
|
|
||||||
| type errorHandler struct { | ||||||
| logger logr.Logger | ||||||
| } | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,88 @@ | ||
| /* | ||
| Copyright 2026 The llm-d Authors. | ||
|
|
||
| Licensed under the Apache License, Version 2.0 (the "License"); | ||
| you may not use this file except in compliance with the License. | ||
| You may obtain a copy of the License at | ||
|
|
||
| http://www.apache.org/licenses/LICENSE-2.0 | ||
|
|
||
| Unless required by applicable law or agreed to in writing, software | ||
| distributed under the License is distributed on an "AS IS" BASIS, | ||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| See the License for the specific language governing permissions and | ||
| limitations under the License. | ||
| */ | ||
|
|
||
| package tracing | ||
|
|
||
| import ( | ||
| "context" | ||
| "testing" | ||
|
|
||
| "go.opentelemetry.io/otel" | ||
| "go.opentelemetry.io/otel/attribute" | ||
| sdktrace "go.opentelemetry.io/otel/sdk/trace" | ||
| "go.opentelemetry.io/otel/sdk/trace/tracetest" | ||
|
|
||
| "github.com/llm-d/llm-d-inference-payload-processor/version" | ||
| ) | ||
|
|
||
| func TestTracer(t *testing.T) { | ||
| const ( | ||
| testBuildRef = "test-build-ref" | ||
| testCommitSHA = "test-commit-sha" | ||
| ) | ||
|
|
||
| origBuildRef, origCommitSHA := version.BuildRef, version.CommitSHA | ||
| version.BuildRef, version.CommitSHA = testBuildRef, testCommitSHA | ||
| t.Cleanup(func() { | ||
| version.BuildRef, version.CommitSHA = origBuildRef, origCommitSHA | ||
| }) | ||
|
|
||
| recorder := tracetest.NewSpanRecorder() | ||
| tp := sdktrace.NewTracerProvider(sdktrace.WithSpanProcessor(recorder)) | ||
| origTP := otel.GetTracerProvider() | ||
| otel.SetTracerProvider(tp) | ||
| t.Cleanup(func() { otel.SetTracerProvider(origTP) }) | ||
|
|
||
| tests := []struct { | ||
| name string | ||
| scope []string | ||
| wantScope string | ||
| }{ | ||
| {name: "default scope", scope: nil, wantScope: instrumentationName}, | ||
| {name: "custom scope", scope: []string{"llm-d-inference-payload-processor/pkg/handlers"}, wantScope: "llm-d-inference-payload-processor/pkg/handlers"}, | ||
| {name: "empty scope falls back to default", scope: []string{""}, wantScope: instrumentationName}, | ||
| } | ||
|
|
||
| for _, tc := range tests { | ||
| t.Run(tc.name, func(t *testing.T) { | ||
| recorder.Reset() | ||
|
|
||
| _, span := Tracer(tc.scope...).Start(context.Background(), "test-span") | ||
| span.End() | ||
|
|
||
| ended := recorder.Ended() | ||
| if len(ended) != 1 { | ||
| t.Fatalf("expected 1 recorded span, got %d", len(ended)) | ||
| } | ||
|
|
||
| scope := ended[0].InstrumentationScope() | ||
| if scope.Name != tc.wantScope { | ||
| t.Errorf("scope name = %q, want %q", scope.Name, tc.wantScope) | ||
| } | ||
| if scope.Version != testBuildRef { | ||
| t.Errorf("scope version = %q, want %q", scope.Version, testBuildRef) | ||
| } | ||
|
|
||
| commitSHA, ok := scope.Attributes.Value(attribute.Key("commit-sha")) | ||
| if !ok { | ||
| t.Fatal("commit-sha scope attribute not set") | ||
| } | ||
| if commitSHA.AsString() != testCommitSHA { | ||
| t.Errorf("commit-sha = %q, want %q", commitSHA.AsString(), testCommitSHA) | ||
| } | ||
| }) | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,11 +24,15 @@ import ( | |
| "time" | ||
|
|
||
| eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" | ||
| "go.opentelemetry.io/otel/attribute" | ||
| "go.opentelemetry.io/otel/codes" | ||
| "go.opentelemetry.io/otel/trace" | ||
| "sigs.k8s.io/controller-runtime/pkg/log" | ||
|
|
||
| envoy "github.com/llm-d/llm-d-inference-payload-processor/pkg/common/envoy" | ||
| errcommon "github.com/llm-d/llm-d-inference-payload-processor/pkg/common/error" | ||
| logutil "github.com/llm-d/llm-d-inference-payload-processor/pkg/common/observability/logging" | ||
| "github.com/llm-d/llm-d-inference-payload-processor/pkg/common/observability/tracing" | ||
| datasource "github.com/llm-d/llm-d-inference-payload-processor/pkg/framework/interface/datalayer/datasource" | ||
| "github.com/llm-d/llm-d-inference-payload-processor/pkg/framework/interface/plugin" | ||
| "github.com/llm-d/llm-d-inference-payload-processor/pkg/framework/interface/requesthandling" | ||
|
|
@@ -131,17 +135,34 @@ func (s *Server) runRequestPlugins(ctx context.Context, cycleState *plugin.Cycle | |
| verboseLogger := logger.V(logutil.VERBOSE) | ||
| verboseEnabled := verboseLogger.Enabled() | ||
|
|
||
| // Stage span grouping the per-plugin spans under gateway.request. | ||
| tracer := tracing.Tracer(handlersTracerScope) | ||
| ctx, stageSpan := tracer.Start(ctx, "request_plugins", trace.WithSpanKind(trace.SpanKindInternal)) | ||
| defer stageSpan.End() | ||
|
|
||
| for _, reqPlugin := range reqPlugins { | ||
| typedName := reqPlugin.TypedName() | ||
| if verboseEnabled { | ||
| verboseLogger.Info("Executing request plugin", "plugin", reqPlugin.TypedName()) | ||
| verboseLogger.Info("Executing request plugin", "plugin", typedName) | ||
| } | ||
| pluginCtx, span := tracer.Start(ctx, "plugin."+typedName.Type, | ||
| trace.WithSpanKind(trace.SpanKindInternal), | ||
| trace.WithAttributes( | ||
| attribute.String("llm_d.plugin.extension_point", requestPluginExtensionPoint), | ||
| attribute.String("llm_d.plugin.type", typedName.Type), | ||
| attribute.String("llm_d.plugin.name", typedName.Name), | ||
| )) | ||
| before := time.Now() | ||
| err := reqPlugin.ProcessRequest(ctx, cycleState, request) | ||
| metrics.RecordPluginProcessingLatency(requestPluginExtensionPoint, reqPlugin.TypedName().Type, reqPlugin.TypedName().Name, time.Since(before)) | ||
| err := reqPlugin.ProcessRequest(pluginCtx, cycleState, request) | ||
| metrics.RecordPluginProcessingLatency(requestPluginExtensionPoint, typedName.Type, typedName.Name, time.Since(before)) | ||
| if err != nil { | ||
| logger.Error(err, "Failed to execute request plugin", "plugin", reqPlugin.TypedName()) | ||
| span.RecordError(err) | ||
| span.SetStatus(codes.Error, err.Error()) | ||
| span.End() | ||
| logger.Error(err, "Failed to execute request plugin", "plugin", typedName) | ||
| return err | ||
| } | ||
| span.End() | ||
| } | ||
|
|
||
| return nil | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems that we may want to add more test coverage for these changes: We need a test that does this:
If I'm not mistaken, I don't believe we have that currently which would leave an opportunity for regressions.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll add a |
||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -24,8 +24,6 @@ import ( | |||||
|
|
||||||
| extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" | ||||||
| "github.com/go-logr/logr" | ||||||
| "go.opentelemetry.io/otel" | ||||||
| "go.opentelemetry.io/otel/attribute" | ||||||
| "go.opentelemetry.io/otel/trace" | ||||||
| "google.golang.org/grpc/codes" | ||||||
| "google.golang.org/grpc/status" | ||||||
|
|
@@ -34,11 +32,11 @@ import ( | |||||
| envoy "github.com/llm-d/llm-d-inference-payload-processor/pkg/common/envoy" | ||||||
| errcommon "github.com/llm-d/llm-d-inference-payload-processor/pkg/common/error" | ||||||
| logutil "github.com/llm-d/llm-d-inference-payload-processor/pkg/common/observability/logging" | ||||||
| "github.com/llm-d/llm-d-inference-payload-processor/pkg/common/observability/tracing" | ||||||
| datasource "github.com/llm-d/llm-d-inference-payload-processor/pkg/framework/interface/datalayer/datasource" | ||||||
| "github.com/llm-d/llm-d-inference-payload-processor/pkg/framework/interface/plugin" | ||||||
| "github.com/llm-d/llm-d-inference-payload-processor/pkg/framework/interface/requesthandling" | ||||||
| "github.com/llm-d/llm-d-inference-payload-processor/pkg/metrics" | ||||||
| "github.com/llm-d/llm-d-inference-payload-processor/version" | ||||||
| ) | ||||||
|
|
||||||
| const ( | ||||||
|
|
@@ -47,6 +45,10 @@ const ( | |||||
|
|
||||||
| requestPluginExtensionPoint = "request" | ||||||
| responsePluginExtensionPoint = "response" | ||||||
|
|
||||||
| // handlersTracerScope is the OTel instrumentation scope for spans emitted by | ||||||
| // the request/response handlers, following the package-path naming convention. | ||||||
| handlersTracerScope = "llm-d-inference-payload-processor/pkg/handlers" | ||||||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @shmuelk with your above comment, do we need to update here as well to
Suggested change
|
||||||
| ) | ||||||
|
|
||||||
| func NewServer(profilePicker requesthandling.ProfilePicker, profiles map[string]*requesthandling.Profile) *Server { | ||||||
|
|
@@ -99,14 +101,7 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { | |||||
| ctx := srv.Context() | ||||||
|
|
||||||
| // Start tracing span for the request | ||||||
| tracer := otel.Tracer( | ||||||
| "llm-d-inference-payload-processor/pkg/handlers", | ||||||
| trace.WithInstrumentationVersion(version.BuildRef), | ||||||
| trace.WithInstrumentationAttributes( | ||||||
| attribute.String("commit-sha", version.CommitSHA), | ||||||
| ), | ||||||
| ) | ||||||
| ctx, span := tracer.Start(ctx, "gateway.request", trace.WithSpanKind(trace.SpanKindServer)) | ||||||
| ctx, span := tracing.Tracer(handlersTracerScope).Start(ctx, "gateway.request", trace.WithSpanKind(trace.SpanKindServer)) | ||||||
| defer span.End() | ||||||
|
|
||||||
| logger := log.FromContext(ctx) | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I recommend shortning the name. How about: