Commit aec0975

Weave: Group workflow traces under the parent evaluation call (#663)
When an Evaluation run is started the eval_call id is pushed to the call stack so subsequent traces can be grouped underneath it. This allows the user to debug the predictions, scores and traces for each run easily. - This PR is a workaround as the eval (score) exporting will be migrated to the observability exporter_manager in the future - This workaround only works for local eval. Remote eval is yet to be solved. - This PR also adds a change to wait on the trace export to finish before finishing the evaluation run <img width="2044" height="907" alt="image" src="https://github.com/user-attachments/assets/4f169a63-7152-4b17-a9ea-bdf6ffda9218" /> ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/NVIDIA/NeMo-Agent-Toolkit/blob/develop/docs/source/resources/contributing.md). - We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license. - Any contribution which contains commits that are not Signed-Off will not be accepted. - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - Anuradha Karuppiah (https://github.com/AnuradhaKaruppiah) Approvers: - Matthew Penn (https://github.com/mpenn) URL: #663
1 parent: 992e59e

12 files changed: 313 additions & 53 deletions
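The mechanism is small: Weave keeps a call stack in context, and pushing the evaluation's call onto it makes every trace opened inside the `with` block a child of that call. A minimal sketch of the idea, not the toolkit's API (assuming Weave is installed; `eval_call` stands in for the Call object that `EvaluationLogger` creates, which the diff below reads from `eval_logger._evaluate_call`, and `run_workflow` is an illustrative stand-in for the workflow under evaluation):

```python
# Minimal sketch of the grouping mechanism.
from weave.trace.context.call_context import set_call_stack

with set_call_stack([eval_call]):
    # Every weave op invoked in this block is recorded as a child of
    # eval_call, so workflow traces nest under the evaluation run.
    run_workflow()
```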


packages/nvidia_nat_opentelemetry/tests/observability/test_otel_span_adapter_exporter.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -241,7 +241,7 @@ async def test_end_to_end_span_processing(self, basic_exporter_config, sample_st
         exporter.export(sample_end_event)
 
         # Wait for async processing
-        await exporter._wait_for_tasks()
+        await exporter.wait_for_tasks()
 
         # Verify that export was called (span was processed and exported)
         mock_otlp_exporter.export.assert_called()
@@ -295,7 +295,7 @@ async def test_batching_behavior(self, mock_otlp_exporter_class, basic_exporter_
         exporter.export(end_event)
 
         # Wait for batch processing
-        await exporter._wait_for_tasks()
+        await exporter.wait_for_tasks()
 
         # Verify that export was called (batching should trigger export)
         mock_otlp_exporter.export.assert_called()
```

packages/nvidia_nat_opentelemetry/tests/observability/test_otel_span_adapter_integration.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -124,7 +124,7 @@ async def test_actual_span_export_to_mock_server(self, mock_otlp_server, sample_
         exporter.export(end_event)
 
         # Wait for async export to complete
-        await exporter._wait_for_tasks()
+        await exporter.wait_for_tasks()
 
         # Give a small buffer for HTTP request to complete
         await asyncio.sleep(0.1)
@@ -158,7 +158,7 @@ async def test_export_error_handling_with_real_endpoint(self, sample_events):
         exporter.export(end_event)
 
         # Wait for export attempt (should fail but not crash)
-        await exporter._wait_for_tasks()
+        await exporter.wait_for_tasks()
         await asyncio.sleep(0.1)
 
         # Test passes if no exception was raised - error should be logged internally
@@ -202,7 +202,7 @@ async def test_span_batching_with_real_export(self, mock_otlp_server):
         exporter.export(end_event)
 
         # Wait for batch processing
-        await exporter._wait_for_tasks()
+        await exporter.wait_for_tasks()
         await asyncio.sleep(0.1)
 
         # Validate that batch export occurred
@@ -219,7 +219,7 @@ async def test_basic_export_functionality(self, mock_otlp_server, sample_events):
         async with exporter.start():
             exporter.export(start_event)
             exporter.export(end_event)
-            await exporter._wait_for_tasks()
+            await exporter.wait_for_tasks()
             await asyncio.sleep(0.1)
 
         # Validate that spans were exported
```

src/nat/builder/workflow.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -83,6 +83,9 @@ def has_single_output(self) -> bool:
 
         return self._entry_fn.has_single_output
 
+    async def get_all_exporters(self) -> dict[str, BaseExporter]:
+        return await self._exporter_manager.get_all_exporters()
+
     @asynccontextmanager
     async def run(self, message: InputT):
         """
```

src/nat/eval/config.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -44,6 +44,8 @@ class EvaluationRunConfig(BaseModel):
     # number of passes at each concurrency, if 0 the dataset is adjusted to a multiple of the
     # concurrency. This is only used if adjust_dataset_size is true
     num_passes: int = 0
+    # timeout for waiting for trace export tasks to complete
+    export_timeout: float = 60.0
 
 
 class EvaluationRunOutput(BaseModel):
```
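A hypothetical override of the new field, assuming the remaining `EvaluationRunConfig` fields have usable defaults; slow tracing backends may need more than the default 60 seconds:

```python
from nat.eval.config import EvaluationRunConfig

# Hypothetical: allow two minutes for trace export tasks to finish
# before the evaluation run is declared complete.
config = EvaluationRunConfig(export_timeout=120.0)
```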

src/nat/eval/evaluate.py

Lines changed: 68 additions & 23 deletions

```diff
@@ -63,7 +63,16 @@ def __init__(self, config: EvaluationRunConfig):
 
         # Helpers
         self.intermediate_step_adapter: IntermediateStepAdapter = IntermediateStepAdapter()
-        self.weave_eval: WeaveEvaluationIntegration = WeaveEvaluationIntegration()
+
+        # Create evaluation trace context
+        try:
+            from nat.eval.utils.eval_trace_ctx import WeaveEvalTraceContext
+            self.eval_trace_context = WeaveEvalTraceContext()
+        except Exception:
+            from nat.eval.utils.eval_trace_ctx import EvalTraceContext
+            self.eval_trace_context = EvalTraceContext()
+
+        self.weave_eval: WeaveEvaluationIntegration = WeaveEvaluationIntegration(self.eval_trace_context)
         # Metadata
         self.eval_input: EvalInput | None = None
         self.workflow_interrupted: bool = False
@@ -401,6 +410,33 @@ def _get_workflow_alias(self, workflow_type: str | None = None):
 
         return workflow_type
 
+    async def wait_for_all_export_tasks_local(self, session_manager: SessionManager, timeout: float) -> None:
+        """Wait for all trace export tasks to complete for local workflows.
+
+        This only works for local workflows where we have direct access to the
+        SessionManager and its underlying workflow with exporter manager.
+        """
+        try:
+            workflow = session_manager.workflow
+            all_exporters = await workflow.get_all_exporters()
+            if not all_exporters:
+                logger.debug("No exporters to wait for")
+                return
+
+            logger.info("Waiting for export tasks from %d local exporters (timeout: %ds)", len(all_exporters), timeout)
+
+            for name, exporter in all_exporters.items():
+                try:
+                    await exporter.wait_for_tasks(timeout=timeout)
+                    logger.info("Export tasks completed for exporter: %s", name)
+                except Exception as e:
+                    logger.warning("Error waiting for export tasks from %s: %s", name, e)
+
+            logger.info("All local export task waiting completed")
+
+        except Exception as e:
+            logger.warning("Failed to wait for local export tasks: %s", e)
+
     async def run_and_evaluate(self,
                                session_manager: SessionManager | None = None,
                                job_id: str | None = None) -> EvaluationRunOutput:
@@ -442,11 +478,13 @@
         dataset_config = self.eval_config.general.dataset  # Currently only one dataset is supported
         if not dataset_config:
             logger.info("No dataset found, nothing to evaluate")
-            return EvaluationRunOutput(
-                workflow_output_file=self.workflow_output_file,
-                evaluator_output_files=self.evaluator_output_files,
-                workflow_interrupted=self.workflow_interrupted,
-            )
+            return EvaluationRunOutput(workflow_output_file=self.workflow_output_file,
+                                       evaluator_output_files=self.evaluator_output_files,
+                                       workflow_interrupted=self.workflow_interrupted,
+                                       eval_input=EvalInput(eval_input_items=[]),
+                                       evaluation_results=[],
+                                       usage_stats=UsageStats(),
+                                       profiler_results=ProfilerResults())
 
         dataset_handler = DatasetHandler(dataset_config=dataset_config,
                                          reps=self.config.reps,
@@ -456,30 +494,37 @@
         self.eval_input = dataset_handler.get_eval_input_from_dataset(self.config.dataset)
         if not self.eval_input.eval_input_items:
             logger.info("Dataset is empty. Nothing to evaluate.")
-            return EvaluationRunOutput(
-                workflow_output_file=self.workflow_output_file,
-                evaluator_output_files=self.evaluator_output_files,
-                workflow_interrupted=self.workflow_interrupted,
-            )
+            return EvaluationRunOutput(workflow_output_file=self.workflow_output_file,
+                                       evaluator_output_files=self.evaluator_output_files,
+                                       workflow_interrupted=self.workflow_interrupted,
+                                       eval_input=self.eval_input,
+                                       evaluation_results=self.evaluation_results,
+                                       usage_stats=self.usage_stats,
+                                       profiler_results=ProfilerResults())
 
         # Run workflow and evaluate
         async with WorkflowEvalBuilder.from_config(config=config) as eval_workflow:
             # Initialize Weave integration
             self.weave_eval.initialize_logger(workflow_alias, self.eval_input, config)
 
             # Run workflow
-            if self.config.endpoint:
-                await self.run_workflow_remote()
-            else:
-                if not self.config.skip_workflow:
-                    if session_manager is None:
-                        session_manager = SessionManager(eval_workflow.build(),
-                                                         max_concurrency=self.eval_config.general.max_concurrency)
-                    await self.run_workflow_local(session_manager)
-
-            # Evaluate
-            evaluators = {name: eval_workflow.get_evaluator(name) for name in self.eval_config.evaluators}
-            await self.run_evaluators(evaluators)
+            with self.eval_trace_context.evaluation_context():
+                if self.config.endpoint:
+                    await self.run_workflow_remote()
+                else:
+                    if not self.config.skip_workflow:
+                        if session_manager is None:
+                            session_manager = SessionManager(eval_workflow.build(),
+                                                             max_concurrency=self.eval_config.general.max_concurrency)
+                        await self.run_workflow_local(session_manager)
+
+                # Evaluate
+                evaluators = {name: eval_workflow.get_evaluator(name) for name in self.eval_config.evaluators}
+                await self.run_evaluators(evaluators)
+
+            # Wait for all trace export tasks to complete (local workflows only)
+            if session_manager and not self.config.endpoint:
+                await self.wait_for_all_export_tasks_local(session_manager, timeout=self.config.export_timeout)
 
             # Profile the workflow
             profiler_results = await self.profile_workflow()
```
src/nat/eval/utils/eval_trace_ctx.py (new file)

Lines changed: 89 additions & 0 deletions

```python
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from collections.abc import Callable
from contextlib import contextmanager
from typing import Any

logger = logging.getLogger(__name__)

# Type alias for evaluation call objects that have an optional 'id' attribute
EvalCallType = Any  # Could be Weave Call object or other tracing framework objects


class EvalTraceContext:
    """
    Evaluation trace context manager for coordinating traces.

    This class provides a framework-agnostic way to:
    1. Track evaluation calls/contexts
    2. Ensure proper parent-child relationships in traces
    """

    def __init__(self):
        self.eval_call: EvalCallType | None = None  # Store the evaluation call/context for propagation

    def set_eval_call(self, eval_call: EvalCallType | None) -> None:
        """Set the evaluation call/context for propagation to traces."""
        self.eval_call = eval_call
        if eval_call:
            logger.debug("Set evaluation call context: %s", getattr(eval_call, 'id', str(eval_call)))

    def get_eval_call(self) -> EvalCallType | None:
        """Get the current evaluation call/context."""
        return self.eval_call

    @contextmanager
    def evaluation_context(self):
        """
        Context manager that can be overridden by framework-specific implementations.
        Default implementation is a no-op.
        """
        yield


class WeaveEvalTraceContext(EvalTraceContext):
    """
    Weave-specific implementation of evaluation trace context.
    """

    def __init__(self):
        super().__init__()
        self.available = False
        self.set_call_stack: Callable[[list[EvalCallType]], Any] | None = None

        try:
            from weave.trace.context.call_context import set_call_stack
            self.set_call_stack = set_call_stack
            self.available = True
        except ImportError:
            self.available = False
            logger.debug("Weave not available for trace context")

    @contextmanager
    def evaluation_context(self):
        """Set the evaluation call as active context for Weave traces."""
        if self.available and self.eval_call and self.set_call_stack:
            try:
                with self.set_call_stack([self.eval_call]):
                    logger.debug("Set Weave evaluation call context: %s",
                                 getattr(self.eval_call, 'id', str(self.eval_call)))
                    yield
            except Exception as e:
                logger.warning("Failed to set Weave evaluation call context: %s", e)
                yield
        else:
            yield
```
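A usage sketch of the new module, following the same import-with-fallback wiring that `evaluate.py` uses above (`eval_call` is assumed to come from Weave's `EvaluationLogger`):

```python
# Sketch: prefer the Weave-aware context, fall back to the no-op base
# class when weave is not importable.
try:
    from nat.eval.utils.eval_trace_ctx import WeaveEvalTraceContext
    trace_ctx = WeaveEvalTraceContext()
except Exception:
    from nat.eval.utils.eval_trace_ctx import EvalTraceContext
    trace_ctx = EvalTraceContext()

trace_ctx.set_eval_call(eval_call)

with trace_ctx.evaluation_context():
    # Traces emitted here are parented under eval_call when Weave is
    # available; otherwise the context manager is a no-op.
    ...
```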

src/nat/eval/utils/weave_eval.py

Lines changed: 13 additions & 6 deletions

```diff
@@ -15,6 +15,7 @@
 
 import asyncio
 import logging
+from typing import TYPE_CHECKING
 from typing import Any
 
 from nat.eval.evaluator.evaluator_model import EvalInput
@@ -24,6 +25,9 @@
 from nat.eval.usage_stats import UsageStatsItem
 from nat.profiler.data_models import ProfilerResults
 
+if TYPE_CHECKING:
+    from nat.eval.utils.eval_trace_ctx import EvalTraceContext
+
 logger = logging.getLogger(__name__)
 
 
@@ -32,18 +36,19 @@ class WeaveEvaluationIntegration:  # pylint: disable=too-many-public-methods
     Class to handle all Weave integration functionality.
     """
 
-    def __init__(self):
+    def __init__(self, eval_trace_context: "EvalTraceContext"):
         self.available = False
         self.client = None
         self.eval_logger = None
         self.pred_loggers = {}
+        self.eval_trace_context = eval_trace_context
 
         try:
             from weave.flow.eval_imperative import EvaluationLogger
             from weave.flow.eval_imperative import ScoreLogger
             from weave.trace.context import weave_client_context
-            self.EvaluationLogger = EvaluationLogger
-            self.ScoreLogger = ScoreLogger
+            self.evaluation_logger_cls = EvaluationLogger  # pylint: disable=invalid-name
+            self.score_logger_cls = ScoreLogger  # pylint: disable=invalid-name
             self.weave_client_context = weave_client_context
             self.available = True
         except ImportError:
@@ -89,9 +94,12 @@ def initialize_logger(self, workflow_alias: str, eval_input: EvalInput, config:
             weave_dataset = self._get_weave_dataset(eval_input)
             config_dict = config.model_dump(mode="json")
             config_dict["name"] = workflow_alias
-            self.eval_logger = self.EvaluationLogger(model=config_dict, dataset=weave_dataset)
+            self.eval_logger = self.evaluation_logger_cls(model=config_dict, dataset=weave_dataset)
             self.pred_loggers = {}
 
+            # Capture the current evaluation call for context propagation
+            self.eval_trace_context.set_eval_call(self.eval_logger._evaluate_call)
+
             return True
         except Exception as e:
             self.eval_logger = None
@@ -137,7 +145,7 @@ async def alog_score(self, eval_output: EvalOutput, evaluator_name: str):
         await asyncio.gather(*coros)
 
     async def afinish_loggers(self):
-        """Finish all prediction loggers."""
+        """Finish all prediction loggers and wait for exports."""
         if not self.eval_logger:
             return
 
@@ -157,7 +165,6 @@ def _log_profiler_metrics(self, profiler_results: ProfilerResults, usage_stats:
         if profiler_results.workflow_runtime_metrics:
             profile_metrics["wf_runtime_p95"] = profiler_results.workflow_runtime_metrics.p95
 
-        # TODO:get the LLM tokens from the usage stats and log them
         profile_metrics["total_runtime"] = usage_stats.total_runtime
 
         return profile_metrics
```

src/nat/observability/exporter/base_exporter.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -357,7 +357,7 @@ async def _cancel_tasks(self):
             except Exception as e:
                 logger.warning("Error while canceling task %s: %s", task.get_name(), e)
 
-    async def _wait_for_tasks(self, timeout: float = 5.0):
+    async def wait_for_tasks(self, timeout: float = 5.0):
         """Wait for all tracked tasks to complete with a timeout.
 
         Note: This method is NOT called during normal stop() operation for performance.
```
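Since `wait_for_tasks` is now public, callers can flush explicitly instead of relying on `stop()`; a sketch in the style of the updated tests above (`exporter` and `event` are assumed to exist):

```python
# Sketch: export an event, then block until the exporter's tracked
# tasks finish (or the timeout elapses) before tearing down.
async with exporter.start():
    exporter.export(event)
    await exporter.wait_for_tasks(timeout=5.0)
```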
