Skip to content

Commit d7bfdba

Browse files
committed
PYCBC-1759: Fix compounding encoding span attribute propagation
Changes ------- * Only acquire lock when setting specific attributes required for threshold logging. * Only update encoding spans cluster_name and cluster_uuid once when processing multi mutation operation core spans. * Only build span snapshot if span is span type that will check a threshold (applies only to threshold logging). Change-Id: Ic5d042dc434fcf1c0d8a6eda350df77d524c4db8 Reviewed-on: https://review.couchbase.org/c/couchbase-python-client/+/243401 Tested-by: Build Bot <build@couchbase.com> Reviewed-by: Dimitris Christodoulou <dimitris.christodoulou@couchbase.com>
1 parent b2fc5d9 commit d7bfdba

2 files changed

Lines changed: 51 additions & 31 deletions

File tree

couchbase/logic/observability/handler.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -875,7 +875,11 @@ def _set_attribute_on_all_spans(self,
875875
if isinstance(self._parent_span, WrappedSpan):
876876
self._parent_span.set_attribute(key, value)
877877

878-
if skip_encoding_span is True:
878+
# For multi-mutation ops (insert/upsert/replace multi), process_core_span is called once per result
879+
# in the batch. The first call ends all encoding spans (setting _encoding_spans_ended=True) and
880+
# propagates cluster labels to them at that point. Subsequent calls have nothing left to update on
881+
# the already-ended encoding spans, so skip the iteration to avoid O(N²) attribute propagation.
882+
if skip_encoding_span is True or self._encoding_spans_ended:
879883
return
880884

881885
if isinstance(self._encoding_spans, list):

couchbase/logic/observability/threshold_logging.py

Lines changed: 46 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,19 @@ class IgnoredParentSpan(Enum):
121121

122122
IGNORED_PARENT_SPAN_VALUES = frozenset(span.value for span in IgnoredParentSpan)
123123

124+
# Only these attribute keys are actually processed in ThresholdLoggingSpan.set_attribute.
125+
# We ignore all other attributes, so lets skip the lock acquisition for them.
126+
PROCESSED_ATTRIBUTE_KEYS = frozenset({
127+
OpAttributeName.Service.value,
128+
DispatchAttributeName.ServerDuration.value,
129+
DispatchAttributeName.LocalId.value,
130+
DispatchAttributeName.OperationId.value,
131+
DispatchAttributeName.PeerAddress.value,
132+
DispatchAttributeName.PeerPort.value,
133+
DispatchAttributeName.ServerAddress.value,
134+
DispatchAttributeName.ServerPort.value,
135+
})
136+
124137

125138
@dataclass(frozen=True)
126139
class ThresholdLoggingSpanSnapshot:
@@ -387,7 +400,9 @@ def total_server_duration_ns(self) -> int:
387400
return self._total_server_duration_ns
388401

389402
def set_attribute(self, key: str, value: SpanAttributeValue) -> None: # noqa: C901
390-
propagate_to_parent = True
403+
if key not in PROCESSED_ATTRIBUTE_KEYS:
404+
return
405+
391406
parent_to_update = None
392407

393408
with self._lock:
@@ -408,12 +423,9 @@ def set_attribute(self, key: str, value: SpanAttributeValue) -> None: # noqa: C
408423
self._remote_address = str(value)
409424
elif key == DispatchAttributeName.ServerPort.value:
410425
self._remote_port = int(value)
411-
else:
412-
propagate_to_parent = False
413426

414427
# Capture parent reference inside lock to avoid race condition
415-
if propagate_to_parent:
416-
parent_to_update = self._parent_span
428+
parent_to_update = self._parent_span
417429

418430
# Propagate to parent outside lock to avoid potential deadlock
419431
if parent_to_update:
@@ -432,46 +444,50 @@ def set_status(self, status: SpanStatusCode) -> None:
432444
self._status = status
433445

434446
def end(self, end_time: Optional[int] = None) -> None:
435-
# Idempotent end with lock and snapshot building
447+
# Idempotent end; only compute total duration under the lock.
436448
with self._lock:
437449
if self._end_time_ns is not None:
438-
# Already ended, do nothing
439450
return
440-
441451
self._end_time_ns = end_time if end_time is not None else time_ns()
442452
self._total_duration_ns = self._end_time_ns - self._start_time_ns
443453

444-
# Build snapshot of span state for threshold logging
445-
self._span_snapshot = ThresholdLoggingSpanSnapshot(
446-
name=self._name,
447-
service_type=self._service_type,
448-
total_duration_ns=self._total_duration_ns,
449-
encode_duration_ns=self._total_encode_duration_ns,
450-
dispatch_duration_ns=self._dispatch_duration_ns,
451-
total_dispatch_duration_ns=self._total_dispatch_duration_ns,
452-
server_duration_ns=self._server_duration_ns,
453-
total_server_duration_ns=self._total_server_duration_ns,
454-
local_id=self._local_id,
455-
operation_id=self._operation_id,
456-
local_socket=self.local_socket,
457-
remote_socket=self.remote_socket,
458-
)
459-
460-
# Handle encoding and dispatch duration updates (outside lock to avoid deadlock)
454+
# Encoding/dispatch spans only propagate duration to parent; no snapshot needed.
461455
if self._name == OpAttributeName.EncodingSpanName.value:
462456
self.encode_duration_ns = self._total_duration_ns
457+
return
463458
elif self._name == OpAttributeName.DispatchSpanName.value:
464459
self.dispatch_duration_ns = self._total_duration_ns
465-
elif (self._name in IGNORED_MULTI_OP_SPAN_VALUES # for now multi-op spans are ignored for threshold logging
466-
or (self._parent_span and self._parent_span.name in IGNORED_MULTI_OP_SPAN_VALUES)):
467460
return
468-
elif self._parent_span is not None and self._parent_span.name in IGNORED_PARENT_SPAN_VALUES:
461+
462+
# Multi-op wrapper spans and their direct children are not threshold-checked.
463+
if (self._name in IGNORED_MULTI_OP_SPAN_VALUES
464+
or (self._parent_span and self._parent_span.name in IGNORED_MULTI_OP_SPAN_VALUES)):
465+
return
466+
467+
# Build snapshot only for spans that require threshold evaluation.
468+
snapshot = ThresholdLoggingSpanSnapshot(
469+
name=self._name,
470+
service_type=self._service_type,
471+
total_duration_ns=self._total_duration_ns,
472+
encode_duration_ns=self._total_encode_duration_ns,
473+
dispatch_duration_ns=self._dispatch_duration_ns,
474+
total_dispatch_duration_ns=self._total_dispatch_duration_ns,
475+
server_duration_ns=self._server_duration_ns,
476+
total_server_duration_ns=self._total_server_duration_ns,
477+
local_id=self._local_id,
478+
operation_id=self._operation_id,
479+
local_socket=self.local_socket,
480+
remote_socket=self.remote_socket,
481+
)
482+
self._span_snapshot = snapshot
483+
484+
if self._parent_span is not None and self._parent_span.name in IGNORED_PARENT_SPAN_VALUES:
469485
if self._tracer:
470-
self._tracer.check_threshold(self._span_snapshot)
486+
self._tracer.check_threshold(snapshot)
471487
elif (self._parent_span is None
472488
and self._tracer
473489
and self._name not in IGNORED_PARENT_SPAN_VALUES):
474-
self._tracer.check_threshold(self._span_snapshot)
490+
self._tracer.check_threshold(snapshot)
475491

476492

477493
class ThresholdLoggingTracer(RequestTracer):

0 commit comments

Comments
 (0)