Skip to content

Commit 11cd7c2

Browse files
feat(llm): wire observe_llm_call into honcho_llm_call
Wraps the body of honcho_llm_call (both tool-less and tool-loop paths) in observe_llm_call(...) so every invocation produces one set of Prometheus samples and one logfmt log line. Captures the AttemptPlan that produced the most-recent (and on success, the winning) call via a `last_plan` cell updated inside _get_attempt_plan, so the recorded provider/model is the one that actually answered — primary on early attempts, backup on the final retry. This makes backup-on-final-attempt observable directly from llm_calls / llm_tokens without parsing logs. Passes track_name and trace_name through to execute_tool_loop so its per-tool counter (added in the previous commit) carries the same feature label as the call-level metrics. When the tool loop returns response.hit_max_iterations=True, the call's outcome is overridden to error_max_iterations via mark_max_iterations so dashboards can split "model didn't converge" from clean success without the tool-loop having to know about outcome semantics. Streaming responses don't carry token counts at the entry point — the recorded call still emits but token counters skip those rows (record_llm_tokens silently no-ops on count<=0). Acceptable partial signal until streaming refactor surfaces tokens earlier. ruff + basedpyright clean. End-to-end smoke test verified that all six series fire correctly across success, success_via_backup, error_max_iterations, error_timeout, and tool-call paths.
1 parent 873e5e3 commit 11cd7c2

1 file changed

Lines changed: 97 additions & 43 deletions

File tree

src/llm/api.py

Lines changed: 97 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@
2121

2222
from src.config import ConfiguredModelSettings, ModelConfig
2323
from src.exceptions import ValidationException
24+
from src.telemetry.llm_call_metrics import (
25+
finalize_success,
26+
mark_max_iterations,
27+
observe_llm_call,
28+
)
2429
from src.telemetry.logging import conditional_observe
2530
from src.telemetry.reasoning_traces import log_reasoning_trace
2631

@@ -193,6 +198,11 @@ async def honcho_llm_call(
193198
# tenacity uses 1-indexed attempts.
194199
current_attempt.set(1)
195200

201+
# Captures the AttemptPlan that produced the most recent (and on success,
202+
# the winning) call so observability can label by the model that actually
203+
# answered — primary on early attempts, backup on the final retry.
204+
last_plan: dict[str, AttemptPlan | None] = {"value": None}
205+
196206
def _get_attempt_plan() -> AttemptPlan:
197207
plan = plan_attempt(
198208
runtime_model_config=runtime_model_config,
@@ -201,6 +211,7 @@ def _get_attempt_plan() -> AttemptPlan:
201211
call_thinking_budget_tokens=thinking_budget_tokens,
202212
call_reasoning_effort=reasoning_effort,
203213
)
214+
last_plan["value"] = plan
204215
update_current_langfuse_observation(
205216
plan.provider,
206217
plan.model,
@@ -304,11 +315,92 @@ def _trace_stop_seqs() -> list[str] | None:
304315
stop_seqs if stop_seqs is not None else runtime_model_config.stop_sequences
305316
)
306317

307-
# Tool-less path: call once and return.
308-
if not tools or not tool_executor:
309-
result: (
310-
HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk]
311-
) = await decorated()
318+
with observe_llm_call(
319+
track_name=track_name,
320+
trace_name=trace_name,
321+
runtime_model_config=runtime_model_config,
322+
) as obs_state:
323+
# Tool-less path: call once and return.
324+
if not tools or not tool_executor:
325+
result: (
326+
HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk]
327+
) = await decorated()
328+
response_for_metrics = (
329+
result if isinstance(result, HonchoLLMCallResponse) else None
330+
)
331+
winning = last_plan["value"]
332+
finalize_success(
333+
obs_state,
334+
response=response_for_metrics,
335+
final_provider=str(winning.provider) if winning else None,
336+
final_model=winning.model if winning else None,
337+
attempts=current_attempt.get(),
338+
iterations=None,
339+
has_backup=runtime_model_config.fallback is not None,
340+
)
341+
if trace_name and isinstance(result, HonchoLLMCallResponse):
342+
log_reasoning_trace(
343+
task_type=trace_name,
344+
model_config=runtime_model_config,
345+
prompt=prompt,
346+
response=result,
347+
max_tokens=max_tokens,
348+
thinking_budget_tokens=_trace_thinking_budget(),
349+
reasoning_effort=_trace_reasoning_effort(),
350+
json_mode=json_mode,
351+
stop_seqs=_trace_stop_seqs(),
352+
messages=messages,
353+
)
354+
return result
355+
356+
# execute_tool_loop raises ValidationException on out-of-range
357+
# max_tool_iterations; fail-fast is cheaper than silent clamping here.
358+
result = await execute_tool_loop(
359+
prompt=prompt,
360+
max_tokens=max_tokens,
361+
messages=messages,
362+
tools=tools,
363+
tool_choice=tool_choice,
364+
tool_executor=tool_executor,
365+
max_tool_iterations=max_tool_iterations,
366+
response_model=response_model,
367+
json_mode=json_mode,
368+
temperature=temperature,
369+
stop_seqs=stop_seqs,
370+
verbosity=verbosity,
371+
enable_retry=enable_retry,
372+
retry_attempts=retry_attempts,
373+
max_input_tokens=max_input_tokens,
374+
get_attempt_plan=_get_attempt_plan,
375+
before_retry_callback=before_retry_callback,
376+
stream_final=stream_final_only,
377+
iteration_callback=iteration_callback,
378+
track_name=track_name,
379+
trace_name=trace_name,
380+
)
381+
response_for_metrics = (
382+
result if isinstance(result, HonchoLLMCallResponse) else None
383+
)
384+
winning = last_plan["value"]
385+
iterations = (
386+
response_for_metrics.iterations
387+
if response_for_metrics
388+
else (getattr(result, "iterations", None))
389+
)
390+
finalize_success(
391+
obs_state,
392+
response=response_for_metrics,
393+
final_provider=str(winning.provider) if winning else None,
394+
final_model=winning.model if winning else None,
395+
attempts=current_attempt.get(),
396+
iterations=iterations,
397+
has_backup=runtime_model_config.fallback is not None,
398+
)
399+
if response_for_metrics is not None and getattr(
400+
response_for_metrics, "hit_max_iterations", False
401+
):
402+
mark_max_iterations(obs_state, iterations or max_tool_iterations)
403+
312404
if trace_name and isinstance(result, HonchoLLMCallResponse):
313405
log_reasoning_trace(
314406
task_type=trace_name,
@@ -324,43 +416,5 @@ def _trace_stop_seqs() -> list[str] | None:
324416
)
325417
return result
326418

327-
# execute_tool_loop raises ValidationException on out-of-range
328-
# max_tool_iterations; fail-fast is cheaper than silent clamping here.
329-
result = await execute_tool_loop(
330-
prompt=prompt,
331-
max_tokens=max_tokens,
332-
messages=messages,
333-
tools=tools,
334-
tool_choice=tool_choice,
335-
tool_executor=tool_executor,
336-
max_tool_iterations=max_tool_iterations,
337-
response_model=response_model,
338-
json_mode=json_mode,
339-
temperature=temperature,
340-
stop_seqs=stop_seqs,
341-
verbosity=verbosity,
342-
enable_retry=enable_retry,
343-
retry_attempts=retry_attempts,
344-
max_input_tokens=max_input_tokens,
345-
get_attempt_plan=_get_attempt_plan,
346-
before_retry_callback=before_retry_callback,
347-
stream_final=stream_final_only,
348-
iteration_callback=iteration_callback,
349-
)
350-
if trace_name and isinstance(result, HonchoLLMCallResponse):
351-
log_reasoning_trace(
352-
task_type=trace_name,
353-
model_config=runtime_model_config,
354-
prompt=prompt,
355-
response=result,
356-
max_tokens=max_tokens,
357-
thinking_budget_tokens=_trace_thinking_budget(),
358-
reasoning_effort=_trace_reasoning_effort(),
359-
json_mode=json_mode,
360-
stop_seqs=_trace_stop_seqs(),
361-
messages=messages,
362-
)
363-
return result
364-
365419

366420
__all__ = ["honcho_llm_call"]

0 commit comments

Comments
 (0)