fix(tracing): migrate RemoteRolloutProcessor to /traces endpoint

xzrderek · xzrderek · commit aa476f00befa · 2026-04-20T18:12:48.000-07:00
The `/v1/traces/pointwise` endpoint was removed upstream in favor of a
unified `/v1/traces` endpoint. This patches `FireworksTracingAdapter`
and its default data loader to match the new contract.

Wire changes:
- URL: `/v1/traces/pointwise` → `/v1/traces` (both the default path and
  the `project_id`-scoped variant).
- Query params: the old endpoint accepted `tags=rollout_id:<id>` as the
  only way to scope a request; the new one expects `rollout_id` as a
  top-level query parameter. `get_evaluation_rows` now extracts the
  rollout id from the `tags` kwarg so existing callers don't break, and
  raises `ValueError` if no `rollout_id:<id>` tag is supplied.
- Response shape: the new endpoint returns flat row dicts with
  PascalCase keys (`Input`, `Output`, `Tags`, `InsertionId`) instead of
  the old nested snake_case shape with an `observations[]` array. The
  converter now reads the new keys and drops the "fall back to last
  GENERATION observation" branch, which has no equivalent server-side
  concept anymore.
- `session_data["langfuse_trace_id"]` is now sourced from
  `InsertionId` so downstream consumers that key on that field keep
  working.

Default data loader in `tracing_utils.py` now asks for `limit=1`, since
`update_row_with_remote_trace` only consumes a single row and raises on
multi-row responses. The `max_retries=5` argument is dropped: it was a
no-op knob for the old Langfuse-polling path, and the new endpoint
doesn't expose it.

Made-with: Cursor
diff --git a/eval_protocol/adapters/fireworks_tracing.py b/eval_protocol/adapters/fireworks_tracing.py
@@ -65,19 +65,22 @@ def convert_trace_dict_to_evaluation_row(
         # Extract messages from trace input and output
         messages = extract_messages_from_trace_dict(trace, include_tool_calls, span_name)
 
-        # Extract tools if available
+        # Extract tools if available. `Input` carries the request payload,
+        # which optionally includes a `tools` array when tool-calling is used.
         tools = None
-        if include_tool_calls and isinstance(trace.get("input"), dict) and "tools" in trace["input"]:
-            tools = trace["input"]["tools"]
+        trace_input = trace.get("Input")
+        if include_tool_calls and isinstance(trace_input, dict) and "tools" in trace_input:
+            tools = trace_input["tools"]
 
         if not messages:
             return None
 
         execution_metadata = ExecutionMetadata()
         row_id = None
 
-        # Extract metadata from tags
-        tags = trace.get("tags", [])
+        # Extract metadata from tags. `Tags` may be absent or null on a row
+        # that was written without any, so coalesce to an empty list.
+        tags = trace.get("Tags") or []
         if tags:
             for tag in tags:
                 if tag.startswith("invocation_id:"):
@@ -106,14 +109,16 @@ def convert_trace_dict_to_evaluation_row(
             input_metadata=InputMetadata(
                 row_id=row_id,
                 session_data={
-                    "langfuse_trace_id": trace.get("id"),  # Store the trace ID here
+                    # Historical key name kept for downstream compatibility;
+                    # sourced from the per-LLM-call identifier on the trace.
+                    "langfuse_trace_id": trace.get("InsertionId"),
                 },
             ),
             execution_metadata=execution_metadata,
         )
 
     except (AttributeError, ValueError, KeyError) as e:
-        logger.error("Error converting trace %s: %s", trace.get("id"), e)
+        logger.error("Error converting trace %s: %s", trace.get("InsertionId"), e)
         return None
 
 
@@ -153,28 +158,15 @@ def extract_messages_from_trace_dict(
 
     else:
         try:
-            # Extract messages from trace input and output
-            if trace.get("input"):
-                messages.extend(extract_messages_from_data(trace["input"], include_tool_calls))
-            if trace.get("output"):
-                messages.extend(extract_messages_from_data(trace["output"], include_tool_calls))
+            # `Input` carries the request messages; `Output` carries the
+            # assistant message returned for this call. `extract_messages_from_data`
+            # accepts both `{"messages": [...]}` and single message dicts.
+            if trace.get("Input"):
+                messages.extend(extract_messages_from_data(trace["Input"], include_tool_calls))
+            if trace.get("Output"):
+                messages.extend(extract_messages_from_data(trace["Output"], include_tool_calls))
         except (AttributeError, ValueError, KeyError) as e:
-            logger.warning("Error processing trace %s: %s", trace.get("id"), e)
-
-        # Fallback: use the last GENERATION observation which typically contains full chat history
-        if not messages:
-            try:
-                all_observations = trace.get("observations", [])
-                gens = [obs for obs in all_observations if obs.get("type") == "GENERATION"]
-                if gens:
-                    gens.sort(key=lambda x: x.get("start_time", ""))
-                    last_gen = gens[-1]
-                    if last_gen.get("input"):
-                        messages.extend(extract_messages_from_data(last_gen["input"], include_tool_calls))
-                    if last_gen.get("output"):
-                        messages.extend(extract_messages_from_data(last_gen["output"], include_tool_calls))
-            except Exception as e:
-                logger.warning("Failed to extract from last generation for trace %s: %s", trace.get("id"), e)
+            logger.warning("Error processing trace %s: %s", trace.get("InsertionId"), e)
 
     return messages
 
@@ -429,13 +421,21 @@ def get_evaluation_rows(
         if not tags or len(tags) == 0:
             raise ValueError("At least one tag is required to fetch traces")
 
+        # Pull out rollout_id only, since that is the task-level id needed to fetch traces.
+        rollout_id = next(
+            (t.split(":", 1)[1] for t in tags if t.startswith("rollout_id:")),
+            None,
+        )
+        if not rollout_id:
+            raise ValueError("tags must contain a 'rollout_id:<id>' entry")
+
         eval_rows = []
 
         # Build query parameters for GET request
         params = {
+            "rollout_id": rollout_id,
             "limit": limit,
             "sample_size": sample_size,
-            "tags": tags,
             "user_id": user_id,
             "session_id": session_id,
             "name": name,
@@ -453,11 +453,11 @@ def get_evaluation_rows(
         # Remove None values
         params = {k: v for k, v in params.items() if v is not None}
 
-        # Make request to proxy (using pointwise for efficiency)
+        # Make request to proxy
         if self.project_id:
-            url = f"{self.base_url}/v1/project_id/{self.project_id}/traces/pointwise"
+            url = f"{self.base_url}/v1/project_id/{self.project_id}/traces"
         else:
-            url = f"{self.base_url}/v1/traces/pointwise"
+            url = f"{self.base_url}/v1/traces"
 
         headers = {
             "Authorization": f"Bearer {self._get_api_key()}",
@@ -500,7 +500,7 @@ def get_evaluation_rows(
                 if eval_row:
                     eval_rows.append(eval_row)
             except (AttributeError, ValueError, KeyError) as e:
-                logger.warning("Failed to convert trace %s: %s", trace.get("id"), e)
+                logger.warning("Failed to convert trace %s: %s", trace.get("InsertionId"), e)
                 continue
 
         logger.info("Successfully converted %d traces to evaluation rows", len(eval_rows))
diff --git a/eval_protocol/pytest/tracing_utils.py b/eval_protocol/pytest/tracing_utils.py
@@ -15,14 +15,19 @@
 
 
 def default_fireworks_output_data_loader(config: DataLoaderConfig) -> DynamicDataLoader:
-    """Default output data loader that fetches traces from Fireworks tracing proxy."""
+    """Default output data loader that fetches traces from Fireworks tracing proxy.
+
+    Requests a single trace per rollout — `update_row_with_remote_trace` in
+    this module only consumes one row and raises if more come back, so
+    pulling the full list would just waste bytes on the wire.
+    """
 
     def fetch_traces() -> List[EvaluationRow]:
         base_url = config.model_base_url or "https://tracing.fireworks.ai"
         # Use EP_REMOTE_API_KEY for fetching remote traces, falling back to FIREWORKS_API_KEY
         api_key = os.environ.get("EP_REMOTE_API_KEY") or os.environ.get("FIREWORKS_API_KEY")
         adapter = FireworksTracingAdapter(base_url=base_url, api_key=api_key)
-        return adapter.get_evaluation_rows(tags=[f"rollout_id:{config.rollout_id}"], max_retries=5)
+        return adapter.get_evaluation_rows(tags=[f"rollout_id:{config.rollout_id}"], limit=1)
 
     return DynamicDataLoader(generators=[fetch_traces], preprocess_fn=filter_longest_conversation)