Skip to content

Commit b2249a0

Browse files
committed
NCBC-4202: Establish FIT observability parity for Stellar SDK
Motivation ========== The FIT observability suite failed during trace validation because Stellar SDK spans (Protostellar) lacked the proper OpenTelemetry parent-child relationships and semantic attributes required by the test driver. Modifications ============= - Updated `StellarCollection`, `StellarQueryClient`, `StellarSearchClient`, and `StellarAnalyticsClient` to extract `RequestSpan` from options and properly pass it to the SDK's internal `IRequestTracer`. - Populated standard OpenTelemetry attributes (e.g., `db.system`, `db.couchbase.service`, `db.operation`, `db.name`) on the generated child spans to ensure full telemetry compliance. - Implemented span status mapping to capture exceptions securely within `StellarRetryHandler` and individual service flows. Result ====== The `kvReplace` and other related tests in the FIT observability suite now pass, correctly validating the trace hierarchy and semantic attributes. Change-Id: I531b38123d025f3919c1d3c9e3b894f6d25046b4 Reviewed-on: https://review.couchbase.org/c/couchbase-net-client/+/244958 Reviewed-by: Emilien Bevierre <emilien.bevierre@couchbase.com> Tested-by: Build Bot <build@couchbase.com>
1 parent ed9793d commit b2249a0

9 files changed

Lines changed: 277 additions & 43 deletions

File tree

src/Couchbase/Core/Diagnostics/Metrics/MetricTracker.cs

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ static MetricTracker()
173173
/// </summary>
174174
private static class ModernAttributes
175175
{
176+
public const string SystemName = "db.system.name";
177+
public const string SystemValue = "couchbase";
176178
public const string Service = "couchbase.service";
177179
public const string Operation = "db.operation.name";
178180
public const string Namespace = "db.namespace";
@@ -210,6 +212,7 @@ public static void TrackOperation(OperationBase operation, TimeSpan duration, Ty
210212
// Modern metrics
211213
var modernTags = new TagList
212214
{
215+
{ ModernAttributes.SystemName, ModernAttributes.SystemValue },
213216
{ ModernAttributes.Service, OuterRequestSpans.ServiceSpan.Kv.Name },
214217
{ ModernAttributes.Operation, operation.OpCode.ToMetricTag() },
215218
{ ModernAttributes.Namespace, operation.BucketName },
@@ -316,6 +319,7 @@ public static void TrackOperation(QueryRequest queryRequest, TimeSpan duration,
316319
// Modern metrics
317320
var modernTags = new TagList
318321
{
322+
new(ModernAttributes.SystemName, ModernAttributes.SystemValue),
319323
new(ModernAttributes.Service, OuterRequestSpans.ServiceSpan.N1QLQuery),
320324
new(ModernAttributes.Operation, OuterRequestSpans.ServiceSpan.N1QLQuery),
321325
new(ModernAttributes.Namespace, queryRequest.Options?.BucketName),
@@ -351,6 +355,7 @@ public static void TrackOperation(AnalyticsRequest analyticsRequest, TimeSpan du
351355
// Modern metrics
352356
var modernTags = new TagList
353357
{
358+
new(ModernAttributes.SystemName, ModernAttributes.SystemValue),
354359
new(ModernAttributes.Service, OuterRequestSpans.ServiceSpan.AnalyticsQuery),
355360
new(ModernAttributes.Namespace, analyticsRequest.Options?.BucketName),
356361
new(ModernAttributes.ScopeName, analyticsRequest.Options?.ScopeName),
@@ -384,6 +389,7 @@ public static void TrackOperation(FtsSearchRequest searchRequest, TimeSpan durat
384389
// Modern metrics
385390
var modernTags = new TagList
386391
{
392+
new(ModernAttributes.SystemName, ModernAttributes.SystemValue),
387393
new(ModernAttributes.Service, OuterRequestSpans.ServiceSpan.SearchQuery),
388394
new(ModernAttributes.ScopeName, searchRequest.Options?.ScopeName),
389395
new(ModernAttributes.Outcome, GetOutcome(errorType))
@@ -484,6 +490,7 @@ public static void TrackOperation(ViewQuery viewQuery, TimeSpan duration, Type?
484490
// Modern metrics
485491
var modernTags = new TagList
486492
{
493+
new(ModernAttributes.SystemName, ModernAttributes.SystemValue),
487494
new(ModernAttributes.Service, OuterRequestSpans.ServiceSpan.ViewQuery),
488495
new(ModernAttributes.Namespace, viewQuery.BucketName),
489496
new(ModernAttributes.Outcome, GetOutcome(errorType))
@@ -591,6 +598,7 @@ public static void TrackOperation(
591598
// Modern metrics
592599
var modernTags = new TagList
593600
{
601+
new(ModernAttributes.SystemName, ModernAttributes.SystemValue),
594602
new(ModernAttributes.Service, OuterRequestSpans.ServiceSpan.Management),
595603
new(ModernAttributes.Operation, operationName),
596604
new(ModernAttributes.Outcome, GetOutcome(errorType))
@@ -605,6 +613,60 @@ public static void TrackOperation(
605613
}
606614
}
607615

616+
/// <summary>
617+
/// Metrics for Stellar (Protostellar / CNG) operations that flow through
618+
/// <see cref="Couchbase.Stellar.Core.Retry.StellarRetryHandler"/>.
619+
/// Mirrors the tag schema of <see cref="KeyValue"/> and <see cref="Management"/>
620+
/// so that the same OTel dashboards work across both paths.
621+
/// </summary>
622+
public static class Stellar
623+
{
624+
/// <summary>
625+
/// Tracks a completed Stellar operation (success or failure).
626+
/// Called from <c>StellarRequest.StopRecording</c>.
627+
/// </summary>
628+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
629+
public static void TrackOperation(
630+
string serviceName,
631+
string operationName,
632+
TimeSpan duration,
633+
Type? errorType,
634+
string? bucketName = null,
635+
string? scopeName = null,
636+
string? collectionName = null,
637+
IRequestSpan? span = null)
638+
{
639+
// Legacy metrics
640+
var legacyTags = new TagList
641+
{
642+
new(OuterRequestSpans.Attributes.Service, serviceName),
643+
new(OuterRequestSpans.Attributes.Operation, operationName),
644+
new(OuterRequestSpans.Attributes.Outcome, GetOutcome(errorType))
645+
};
646+
if (bucketName is not null) legacyTags.Add(OuterRequestSpans.Attributes.BucketName, bucketName);
647+
if (scopeName is not null) legacyTags.Add(OuterRequestSpans.Attributes.ScopeName, scopeName);
648+
if (collectionName is not null) legacyTags.Add(OuterRequestSpans.Attributes.CollectionName, collectionName);
649+
650+
legacyTags.AddClusterLabelsIfProvided(span);
651+
LegacyOperations.Record(duration.ToMicroseconds(), legacyTags);
652+
653+
// Modern metrics
654+
var modernTags = new TagList
655+
{
656+
new(ModernAttributes.SystemName, ModernAttributes.SystemValue),
657+
new(ModernAttributes.Service, serviceName),
658+
new(ModernAttributes.Operation, operationName),
659+
new(ModernAttributes.Outcome, GetOutcome(errorType))
660+
};
661+
if (bucketName is not null) modernTags.Add(ModernAttributes.Namespace, bucketName);
662+
if (scopeName is not null) modernTags.Add(ModernAttributes.ScopeName, scopeName);
663+
if (collectionName is not null) modernTags.Add(ModernAttributes.CollectionName, collectionName);
664+
665+
AddModernClusterLabels(ref modernTags, span);
666+
ModernOperations.Record(duration.TotalSeconds, modernTags);
667+
}
668+
}
669+
608670
private static void AddModernClusterLabels(ref TagList tagList, IRequestSpan? span)
609671
{
610672
if (span is RequestSpanWrapper wrapper)

src/Couchbase/Stellar/Analytics/StellarAnalyticsClient.cs

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22
using System;
33
using System.Threading.Tasks;
44
using Couchbase.Analytics;
5+
using Couchbase.Core.Diagnostics.Tracing;
6+
using Couchbase.Core.Retry;
57
using Couchbase.Protostellar.Analytics.V1;
68
using Couchbase.Stellar.Core;
9+
using Couchbase.Stellar.Core.Retry;
710

811
namespace Couchbase.Stellar.Analytics;
912

@@ -13,18 +16,32 @@ internal class StellarAnalyticsClient : AnalyticsService.AnalyticsServiceClient,
1316
{
1417
private readonly StellarCluster _stellarCluster;
1518
private readonly AnalyticsService.AnalyticsServiceClient _analyticsClient;
19+
private readonly IRetryOrchestrator _retryHandler;
1620

17-
public StellarAnalyticsClient(StellarCluster stellarCluster)
21+
public StellarAnalyticsClient(StellarCluster stellarCluster, AnalyticsService.AnalyticsServiceClient analyticsClient, IRetryOrchestrator retryHandler)
1822
{
1923
_stellarCluster = stellarCluster;
20-
_analyticsClient = new AnalyticsService.AnalyticsServiceClient(_stellarCluster.GrpcChannel);
24+
_analyticsClient = analyticsClient;
25+
_retryHandler = retryHandler;
2126
}
2227

2328
public DateTime? LastActivity { get; }
2429

25-
public Task<IAnalyticsResult<T>> QueryAsync<T>(string statement, AnalyticsOptions options)
30+
public async Task<IAnalyticsResult<T>> QueryAsync<T>(string statement, AnalyticsOptions options)
2631
{
2732
var opts = options?.AsReadOnly() ?? AnalyticsOptions.DefaultReadOnly;
33+
34+
using var childSpan = _stellarCluster.RequestTracer.RequestSpan(OuterRequestSpans.ServiceSpan.AnalyticsQuery, opts.RequestSpan);
35+
if (childSpan.CanWrite)
36+
{
37+
childSpan.SetAttribute(OuterRequestSpans.Attributes.System.Key, OuterRequestSpans.Attributes.System.Value);
38+
childSpan.SetAttribute(OuterRequestSpans.Attributes.Service, OuterRequestSpans.ServiceSpan.AnalyticsQuery);
39+
childSpan.SetAttribute(OuterRequestSpans.Attributes.Operation, OuterRequestSpans.ServiceSpan.AnalyticsQuery);
40+
childSpan.SetAttribute(OuterRequestSpans.Attributes.Statement, statement);
41+
if (opts.BucketName != null) childSpan.SetAttribute(OuterRequestSpans.Attributes.BucketName, opts.BucketName);
42+
if (opts.ScopeName != null) childSpan.SetAttribute(OuterRequestSpans.Attributes.ScopeName, opts.ScopeName);
43+
}
44+
2845
var request = new AnalyticsQueryRequest
2946
{
3047
Statement = statement,
@@ -34,12 +51,29 @@ public Task<IAnalyticsResult<T>> QueryAsync<T>(string statement, AnalyticsOption
3451
};
3552
if (opts.BucketName != null) request.BucketName = opts.BucketName;
3653
if (opts.ScopeName != null) request.ScopeName = opts.ScopeName;
37-
if (opts.ClientContextId != null) request.ReadOnly = opts.Readonly;
54+
if (opts.ClientContextId != null) request.ClientContextId = opts.ClientContextId;
3855

39-
var callOptions = _stellarCluster.GrpcCallOptions(opts.Timeout, opts.Token);
40-
var response = _analyticsClient.AnalyticsQuery(request, callOptions);
56+
var stellarRequest = new StellarRequest
57+
{
58+
Idempotent = true,
59+
Token = opts.Token,
60+
Timeout = opts.Timeout ?? _stellarCluster.ClusterOptions.AnalyticsTimeout
61+
};
62+
stellarRequest.SetMetrics(
63+
OuterRequestSpans.ServiceSpan.AnalyticsQuery,
64+
OuterRequestSpans.ServiceSpan.AnalyticsQuery,
65+
childSpan,
66+
opts.BucketName,
67+
opts.ScopeName);
68+
69+
async Task<IAnalyticsResult<T>> GrpcCall()
70+
{
71+
var callOptions = _stellarCluster.GrpcCallOptions(stellarRequest.RemainingTimeout, opts.Token);
72+
var response = _analyticsClient.AnalyticsQuery(request, callOptions);
73+
return new ProtoAnalyticsResult<T>(response, _stellarCluster.TypeSerializer);
74+
}
4175

42-
return Task.FromResult<IAnalyticsResult<T>>(new ProtoAnalyticsResult<T>(response, _stellarCluster.TypeSerializer));
76+
return await _retryHandler.RetryAsync(GrpcCall, stellarRequest).ConfigureAwait(false);
4377
}
4478
}
4579
#endif

src/Couchbase/Stellar/Core/Retry/StellarRequest.cs

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
using System.Collections.Generic;
33
using System.Threading;
44
using Couchbase.Core.Diagnostics.Metrics;
5+
using Couchbase.Core.Diagnostics.Tracing;
56
using Couchbase.Core.Retry;
7+
using Couchbase.Utils;
68

79
#nullable enable
810
namespace Couchbase.Stellar.Core.Retry;
@@ -14,6 +16,7 @@ namespace Couchbase.Stellar.Core.Retry;
1416
public class StellarRequest : IRequest
1517
{
1618
private readonly TimeProvider _timeProvider;
19+
private LightweightStopwatch _stopwatch;
1720

1821
public StellarRequest() : this(TimeProvider.System) { }
1922

@@ -24,14 +27,15 @@ internal StellarRequest(TimeProvider timeProvider)
2427
{
2528
_timeProvider = timeProvider;
2629
CreatedAt = _timeProvider.GetUtcNow().UtcDateTime;
30+
_stopwatch = LightweightStopwatch.StartNew();
2731
}
2832

2933
public uint Attempts { get; set; }
3034
public bool Idempotent { get; set; }
3135
public List<RetryReason> RetryReasons { get; set; } = new();
3236
public IRetryStrategy RetryStrategy { get; set; } = new BestEffortRetryStrategy();
3337
public TimeSpan Timeout { get; set; }
34-
public TimeSpan Elapsed { get; }
38+
public TimeSpan Elapsed => _stopwatch.Elapsed;
3539

3640
/// <summary>
3741
/// The time this request was created. Used to compute remaining timeout.
@@ -51,20 +55,69 @@ internal StellarRequest(TimeProvider timeProvider)
5155
public string? ClientContextId { get; set; }
5256
public string? Statement { get; set; }
5357

58+
internal string? ServiceName { get; set; }
59+
internal string? OperationName { get; set; }
60+
internal string? BucketName { get; set; }
61+
internal string? ScopeName { get; set; }
62+
internal string? CollectionName { get; set; }
63+
internal IRequestSpan? Span { get; set; }
64+
private bool _recordingStopped;
65+
66+
/// <summary>
67+
/// Configures the telemetry context for this request.
68+
/// Must be called before the request enters the retry loop.
69+
/// </summary>
70+
internal void SetMetrics(
71+
string serviceName,
72+
string operationName,
73+
IRequestSpan? span,
74+
string? bucketName = null,
75+
string? scopeName = null,
76+
string? collectionName = null)
77+
{
78+
ServiceName = serviceName;
79+
OperationName = operationName;
80+
Span = span;
81+
BucketName = bucketName;
82+
ScopeName = scopeName;
83+
CollectionName = collectionName;
84+
}
85+
5486
public void StopRecording()
5587
{
56-
throw new NotImplementedException();
88+
StopRecording(errorType: null);
5789
}
5890

59-
public void StopRecording(Type? errorType)
91+
public void StopRecording(System.Type? errorType)
6092
{
61-
throw new NotImplementedException();
93+
if (_recordingStopped) return;
94+
_recordingStopped = true;
95+
96+
var elapsed = _stopwatch.Elapsed;
97+
98+
Span?.SetStatus(errorType == null
99+
? RequestSpanStatusCode.Ok
100+
: RequestSpanStatusCode.Error);
101+
Span?.SetAttribute(OuterRequestSpans.Attributes.Retries, Attempts);
102+
103+
if (ServiceName is not null && OperationName is not null)
104+
{
105+
MetricTracker.Stellar.TrackOperation(
106+
ServiceName,
107+
OperationName,
108+
elapsed,
109+
errorType,
110+
BucketName,
111+
ScopeName,
112+
CollectionName,
113+
Span);
114+
}
62115
}
63116

64-
public IValueRecorder Recorder { get; set; } = null!;
117+
public IValueRecorder Recorder { get; set; } = NoopValueRecorder.Instance;
65118
public void LogOrphaned()
66119
{
67-
throw new NotImplementedException();
120+
// No orphan tracking for Stellar/gRPC requests
68121
}
69122

70123
public GenericErrorContext Context { get; set; } = new();

src/Couchbase/Stellar/Core/Retry/StellarRetryHandler.cs

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -36,40 +36,56 @@ public async Task<T> RetryAsync<T>(Func<Task<T>> send, IRequest request) where T
3636
{
3737
var backoff = ControlledBackoff.Create(_timeProvider);
3838
var context = new GenericErrorContext();
39+
System.Type? finalErrorType = null;
3940

40-
while (true)
41+
try
4142
{
42-
if (request.Token.IsCancellationRequested)
43+
while (true)
4344
{
44-
if (request.Idempotent)
45+
if (request.Token.IsCancellationRequested)
4546
{
46-
throw new UnambiguousTimeoutException("The request timed out.", context);
47-
}
47+
if (request.Idempotent)
48+
{
49+
throw new UnambiguousTimeoutException("The request timed out.", context);
50+
}
4851

49-
throw new AmbiguousTimeoutException("The request timed out.", context);
50-
}
52+
throw new AmbiguousTimeoutException("The request timed out.", context);
53+
}
5154

52-
try
53-
{
54-
return await send().ConfigureAwait(false);
55-
}
56-
catch (RpcException e)
57-
{
58-
if (e.StatusCode != StatusCode.OK)
55+
try
56+
{
57+
return await send().ConfigureAwait(false);
58+
}
59+
catch (RpcException e)
5960
{
60-
HandleException(e, request, context);
61+
if (e.StatusCode != StatusCode.OK)
62+
{
63+
HandleException(e, request, context);
64+
await backoff.Delay(request).ConfigureAwait(false);
65+
}
66+
}
67+
catch (Exception e) when (IsTransientTransportException(e))
68+
{
69+
// HTTP/2 transport failures (e.g. connection reset, broken pipe) can
70+
// surface as HttpRequestException or IOException rather than RpcException.
71+
// Treat these like Unavailable and retry.
72+
context.RetryReasons.Add(RetryReason.ServiceNotAvailable);
73+
request.Attempts++;
6174
await backoff.Delay(request).ConfigureAwait(false);
6275
}
6376
}
64-
catch (Exception e) when (IsTransientTransportException(e))
65-
{
66-
// HTTP/2 transport failures (e.g. connection reset, broken pipe) can
67-
// surface as HttpRequestException or IOException rather than RpcException.
68-
// Treat these like Unavailable and retry.
69-
context.RetryReasons.Add(RetryReason.ServiceNotAvailable);
70-
request.Attempts++;
71-
await backoff.Delay(request).ConfigureAwait(false);
72-
}
77+
}
78+
catch (Exception ex)
79+
{
80+
finalErrorType = ex.GetType();
81+
throw;
82+
}
83+
finally
84+
{
85+
if (request is StellarRequest stellarRequest)
86+
stellarRequest.StopRecording(finalErrorType);
87+
else
88+
request.StopRecording();
7389
}
7490
}
7591

0 commit comments

Comments
 (0)