offendingcommit
diff --git a/src/llm/api.py b/src/llm/api.py
Lines changed: 97 additions & 43 deletions
diff --git a/src/llm/tool_loop.py b/src/llm/tool_loop.py
Lines changed: 17 additions & 0 deletions
diff --git a/src/llm/types.py b/src/llm/types.py
Lines changed: 3 additions & 0 deletions
@@ -21,6 +21,11 @@
 
 from src.config import ConfiguredModelSettings, ModelConfig
 from src.exceptions import ValidationException
+from src.telemetry.llm_call_metrics import (
+    finalize_success,
+    mark_max_iterations,
+    observe_llm_call,
+)
 from src.telemetry.logging import conditional_observe
 from src.telemetry.reasoning_traces import log_reasoning_trace
 
@@ -193,6 +198,11 @@ async def honcho_llm_call(
     # tenacity uses 1-indexed attempts.
     current_attempt.set(1)
 
+    # Captures the AttemptPlan that produced the most recent (and on success,
+    # the winning) call so observability can label by the model that actually
+    # answered — primary on early attempts, backup on the final retry.
+    last_plan: dict[str, AttemptPlan | None] = {"value": None}
+
     def _get_attempt_plan() -> AttemptPlan:
         plan = plan_attempt(
             runtime_model_config=runtime_model_config,
@@ -201,6 +211,7 @@ def _get_attempt_plan() -> AttemptPlan:
             call_thinking_budget_tokens=thinking_budget_tokens,
             call_reasoning_effort=reasoning_effort,
         )
+        last_plan["value"] = plan
         update_current_langfuse_observation(
             plan.provider,
             plan.model,
@@ -304,11 +315,92 @@ def _trace_stop_seqs() -> list[str] | None:
             stop_seqs if stop_seqs is not None else runtime_model_config.stop_sequences
         )
 
-    # Tool-less path: call once and return.
-    if not tools or not tool_executor:
-        result: (
-            HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk]
-        ) = await decorated()
+    with observe_llm_call(
+        track_name=track_name,
+        trace_name=trace_name,
+        runtime_model_config=runtime_model_config,
+    ) as obs_state:
+        # Tool-less path: call once and return.
+        if not tools or not tool_executor:
+            result: (
+                HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk]
+            ) = await decorated()
+            response_for_metrics = (
+                result if isinstance(result, HonchoLLMCallResponse) else None
+            )
+            winning = last_plan["value"]
+            finalize_success(
+                obs_state,
+                response=response_for_metrics,
+                final_provider=str(winning.provider) if winning else None,
+                final_model=winning.model if winning else None,
+                attempts=current_attempt.get(),
+                iterations=None,
+                has_backup=runtime_model_config.fallback is not None,
+            )
+            if trace_name and isinstance(result, HonchoLLMCallResponse):
+                log_reasoning_trace(
+                    task_type=trace_name,
+                    model_config=runtime_model_config,
+                    prompt=prompt,
+                    response=result,
+                    max_tokens=max_tokens,
+                    thinking_budget_tokens=_trace_thinking_budget(),
+                    reasoning_effort=_trace_reasoning_effort(),
+                    json_mode=json_mode,
+                    stop_seqs=_trace_stop_seqs(),
+                    messages=messages,
+                )
+            return result
+
+        # execute_tool_loop raises ValidationException on out-of-range
+        # max_tool_iterations; fail-fast is cheaper than silent clamping here.
+        result = await execute_tool_loop(
+            prompt=prompt,
+            max_tokens=max_tokens,
+            messages=messages,
+            tools=tools,
+            tool_choice=tool_choice,
+            tool_executor=tool_executor,
+            max_tool_iterations=max_tool_iterations,
+            response_model=response_model,
+            json_mode=json_mode,
+            temperature=temperature,
+            stop_seqs=stop_seqs,
+            verbosity=verbosity,
+            enable_retry=enable_retry,
+            retry_attempts=retry_attempts,
+            max_input_tokens=max_input_tokens,
+            get_attempt_plan=_get_attempt_plan,
+            before_retry_callback=before_retry_callback,
+            stream_final=stream_final_only,
+            iteration_callback=iteration_callback,
+            track_name=track_name,
+            trace_name=trace_name,
+        )
+        response_for_metrics = (
+            result if isinstance(result, HonchoLLMCallResponse) else None
+        )
+        winning = last_plan["value"]
+        iterations = (
+            response_for_metrics.iterations
+            if response_for_metrics
+            else (getattr(result, "iterations", None))
+        )
+        finalize_success(
+            obs_state,
+            response=response_for_metrics,
+            final_provider=str(winning.provider) if winning else None,
+            final_model=winning.model if winning else None,
+            attempts=current_attempt.get(),
+            iterations=iterations,
+            has_backup=runtime_model_config.fallback is not None,
+        )
+        if response_for_metrics is not None and getattr(
+            response_for_metrics, "hit_max_iterations", False
+        ):
+            mark_max_iterations(obs_state, iterations or max_tool_iterations)
+
         if trace_name and isinstance(result, HonchoLLMCallResponse):
             log_reasoning_trace(
                 task_type=trace_name,
@@ -324,43 +416,5 @@ def _trace_stop_seqs() -> list[str] | None:
             )
         return result
 
-    # execute_tool_loop raises ValidationException on out-of-range
-    # max_tool_iterations; fail-fast is cheaper than silent clamping here.
-    result = await execute_tool_loop(
-        prompt=prompt,
-        max_tokens=max_tokens,
-        messages=messages,
-        tools=tools,
-        tool_choice=tool_choice,
-        tool_executor=tool_executor,
-        max_tool_iterations=max_tool_iterations,
-        response_model=response_model,
-        json_mode=json_mode,
-        temperature=temperature,
-        stop_seqs=stop_seqs,
-        verbosity=verbosity,
-        enable_retry=enable_retry,
-        retry_attempts=retry_attempts,
-        max_input_tokens=max_input_tokens,
-        get_attempt_plan=_get_attempt_plan,
-        before_retry_callback=before_retry_callback,
-        stream_final=stream_final_only,
-        iteration_callback=iteration_callback,
-    )
-    if trace_name and isinstance(result, HonchoLLMCallResponse):
-        log_reasoning_trace(
-            task_type=trace_name,
-            model_config=runtime_model_config,
-            prompt=prompt,
-            response=result,
-            max_tokens=max_tokens,
-            thinking_budget_tokens=_trace_thinking_budget(),
-            reasoning_effort=_trace_reasoning_effort(),
-            json_mode=json_mode,
-            stop_seqs=_trace_stop_seqs(),
-            messages=messages,
-        )
-    return result
-
 
 __all__ = ["honcho_llm_call"]
@@ -20,6 +20,8 @@
 
 from src.config import ModelTransport
 from src.exceptions import ValidationException
+from src.telemetry.llm_call_metrics import normalize_feature_label
+from src.telemetry.prometheus import prometheus_metrics
 from src.utils.types import set_current_iteration
 
 from .executor import honcho_llm_call_inner
@@ -166,6 +168,8 @@ async def execute_tool_loop(
     before_retry_callback: Callable[[Any], None],
     stream_final: bool = False,
     iteration_callback: IterationCallback | None = None,
+    track_name: str | None = None,
+    trace_name: str | None = None,
 ) -> HonchoLLMCallResponse[Any] | StreamingResponseWithMetadata:
     """Run the iterative tool calling loop for agentic LLM interactions.
 
@@ -188,6 +192,8 @@ async def execute_tool_loop(
             + f"got {max_tool_iterations}"
         )
 
+    feature_label = normalize_feature_label(track_name, trace_name)
+
     conversation_messages: list[dict[str, Any]] = (
         messages.copy() if messages else [{"role": "user", "content": prompt}]
     )
@@ -351,6 +357,11 @@ async def _call_with_messages(
                         "tool_result": tool_result,
                     }
                 )
+                prometheus_metrics.record_llm_tool_call(
+                    feature=feature_label,
+                    tool_name=tool_name,
+                    outcome="success",
+                )
             except Exception as e:
                 logger.error(f"Tool execution failed for {tool_name}: {e}")
                 tool_results.append(
@@ -361,6 +372,11 @@ async def _call_with_messages(
                         "is_error": True,
                     }
                 )
+                prometheus_metrics.record_llm_tool_call(
+                    feature=feature_label,
+                    tool_name=tool_name,
+                    outcome="error",
+                )
 
         append_tool_results(current_provider, tool_results, conversation_messages)
 
@@ -470,6 +486,7 @@ async def _final_call() -> HonchoLLMCallResponse[Any]:
     final_response = await final_call_func()
     final_response.tool_calls_made = all_tool_calls
     final_response.iterations = iteration + 1
+    final_response.hit_max_iterations = True
     final_response.input_tokens = total_input_tokens + final_response.input_tokens
     final_response.output_tokens = total_output_tokens + final_response.output_tokens
     final_response.cache_creation_input_tokens = (
 
@@ -66,6 +66,9 @@ class HonchoLLMCallResponse(BaseModel, Generic[T]):
     tool_calls_made: list[dict[str, Any]] = Field(default_factory=list)
     iterations: int = 0
     """Number of LLM calls made in the tool execution loop."""
+    hit_max_iterations: bool = False
+    """True when the tool loop exited via the max-iterations synthesis path
+    rather than the model deciding to stop. Telemetry-only signal."""
     thinking_content: str | None = None
     # Full thinking blocks with signatures for multi-turn replay (Anthropic only).
     thinking_blocks: list[dict[str, Any]] = Field(default_factory=list)