fix: Properly clean up Langfuse tracing context after pipeline run failures (#1999)

vblagoje · sjrl · web-flow · commit 4157b0b51a2d · 2025-07-01T14:07:19.000+02:00
* Properly clean up context after pipeline run failures

* Format

* Update integrations/langfuse/src/haystack_integrations/tracing/langfuse/tracer.py

Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>

* Lint

* PR feedback

* Small fix

* Small nit

---------

Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>
diff --git a/integrations/langfuse/src/haystack_integrations/tracing/langfuse/tracer.py b/integrations/langfuse/src/haystack_integrations/tracing/langfuse/tracer.py
@@ -449,22 +449,33 @@ def trace(
         self._context.append(span)
         span.set_tags(tags)
 
-        yield span
-
-        # Let the span handler process the span
-        self._span_handler.handle(span, component_type)
-
-        # In this section, we finalize both regular spans and generation spans created using the LangfuseSpan class.
-        # It's important to end() these spans to ensure they are properly closed and all relevant data is recorded.
-        # Note that we do not call end() on the main trace span itself (StatefulTraceClient), as its lifecycle is
-        # managed differently.
-        raw_span = span.raw_span()
-        if isinstance(raw_span, (StatefulSpanClient, StatefulGenerationClient)):
-            raw_span.end()
-        self._context.pop()
-
-        if self.enforce_flush:
-            self.flush()
+        try:
+            yield span
+        finally:
+            # Always clean up context, even if nested operations fail
+            try:
+                # Process span data (may fail with nested pipeline exceptions)
+                self._span_handler.handle(span, component_type)
+
+                # End span (may fail if span data is corrupted)
+                raw_span = span.raw_span()
+                if isinstance(raw_span, (StatefulSpanClient, StatefulGenerationClient)):
+                    raw_span.end()
+            except Exception as cleanup_error:
+                # Log cleanup errors but don't let them corrupt context
+                logger.warning(
+                    "Error during span cleanup for {operation_name}: {cleanup_error}",
+                    operation_name=operation_name,
+                    cleanup_error=cleanup_error,
+                )
+            finally:
+                # CRITICAL: Always pop context to prevent corruption
+                # This is especially important for nested pipeline scenarios
+                if self._context and self._context[-1] == span:
+                    self._context.pop()
+
+            if self.enforce_flush:
+                self.flush()
 
     def flush(self) -> None:
         self._tracer.flush()
diff --git a/integrations/langfuse/tests/test_tracer.py b/integrations/langfuse/tests/test_tracer.py
@@ -3,15 +3,20 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import datetime
+import json
 import logging
 import sys
-from unittest.mock import MagicMock, Mock, patch
 from typing import Optional
+from unittest.mock import MagicMock, Mock, patch
 
 import pytest
+from haystack import Pipeline, component
 from haystack.dataclasses import ChatMessage, ToolCall
-from haystack_integrations.tracing.langfuse.tracer import LangfuseTracer, LangfuseSpan, SpanContext, DefaultSpanHandler
-from haystack_integrations.tracing.langfuse.tracer import _COMPONENT_OUTPUT_KEY
+
+from haystack_integrations.components.connectors.langfuse import LangfuseConnector
+from haystack_integrations.tracing.langfuse.tracer import (
+    _COMPONENT_OUTPUT_KEY, DefaultSpanHandler, LangfuseSpan, LangfuseTracer,
+    SpanContext)
 
 
 class MockSpan:
@@ -367,7 +372,8 @@ def test_update_span_flush_disable(self, monkeypatch):
         monkeypatch.setenv("HAYSTACK_LANGFUSE_ENFORCE_FLUSH", "false")
         tracer_mock = Mock()
 
-        from haystack_integrations.tracing.langfuse.tracer import LangfuseTracer
+        from haystack_integrations.tracing.langfuse.tracer import \
+            LangfuseTracer
 
         tracer = LangfuseTracer(tracer=tracer_mock, name="Haystack", public=False)
         with tracer.trace(operation_name="operation_name", tags={"haystack.pipeline.input_data": "hello"}) as span:
@@ -397,3 +403,58 @@ def test_init_with_tracing_disabled(self, monkeypatch, caplog):
 
             LangfuseTracer(tracer=MockTracer(), name="Haystack", public=False)
             assert "tracing is disabled" in caplog.text
+
+    def test_context_cleanup_after_nested_failures(self):
+        """
+        Test that tracer context is properly cleaned up even when nested operations fail.
+
+        A component that runs its own internal pipeline raises while the outer pipeline
+        is being traced. Every span pushed onto the tracer's context during that run must
+        still be popped when the failure propagates; otherwise stale spans are left behind
+        and affect subsequent operations.
+
+        Before the fix: context would retain spans after failures (length > 0)
+        After the fix: context is always cleaned up (length == 0)
+        """
+
+        @component
+        class FailingParser:
+            @component.output_types(result=str)
+            def run(self, data: str):
+                # json.loads raises json.JSONDecodeError (a ValueError) on invalid JSON
+                parsed = json.loads(data)
+                return {"result": parsed["key"]}
+
+        @component
+        class ComponentWithNestedPipeline:
+            def __init__(self):
+                # Simulates a component that owns and runs an internal pipeline
+                self.internal_pipeline = Pipeline()
+                self.internal_pipeline.add_component("parser", FailingParser())
+
+            @component.output_types(result=str)
+            def run(self, input_data: str):
+                # Run nested pipeline - a failure here is what used to corrupt the context
+                result = self.internal_pipeline.run({"parser": {"data": input_data}})
+                return {"result": result["parser"]["result"]}
+
+        tracer = LangfuseConnector("test")
+
+        main_pipeline = Pipeline()
+        main_pipeline.add_component("nested_component", ComponentWithNestedPipeline())
+        main_pipeline.add_component("tracer", tracer)
+
+        # Test 1: the first run MUST fail; pytest.raises makes the test fail if it does not,
+        # instead of silently passing as a bare try/except/pass would. The broad Exception
+        # type is deliberate: the pipeline may wrap the underlying JSON decoding error.
+        with pytest.raises(Exception):  # noqa: B017
+            main_pipeline.run({"nested_component": {"input_data": "invalid json"}})
+
+        # Critical assertion: context should be empty after failed operation
+        assert len(tracer.tracer._context) == 0
+
+        # Test 2: Second run should work normally with clean context
+        main_pipeline.run({"nested_component": {"input_data": '{"key": "valid"}'}})
+
+        # Critical assertion: context should be empty after successful operation
+        assert len(tracer.tracer._context) == 0