Skip to content

Commit 11cd7c2

Browse files
feat(llm): wire observe_llm_call into honcho_llm_call
Wraps the body of honcho_llm_call (both tool-less and tool-loop paths) in observe_llm_call(...) so every invocation produces one set of Prometheus samples and one logfmt log line. Captures the AttemptPlan that produced the most-recent (and on success, the winning) call via a `last_plan` cell updated inside _get_attempt_plan, so the recorded provider/model is the one that actually answered — primary on early attempts, backup on the final retry. This makes backup-on-final-attempt observable directly from llm_calls / llm_tokens without parsing logs. Passes track_name and trace_name through to execute_tool_loop so its per-tool counter (added in the previous commit) carries the same feature label as the call-level metrics. When the tool loop returns response.hit_max_iterations=True, the call's outcome is overridden to error_max_iterations via mark_max_iterations so dashboards can split "model didn't converge" from clean success without the tool-loop having to know about outcome semantics. Streaming responses don't carry token counts at the entry point — the recorded call still emits but token counters skip those rows (record_llm_tokens silently no-ops on count<=0). Acceptable partial signal until streaming refactor surfaces tokens earlier. ruff + basedpyright clean. End-to-end smoke test verified that all six series fire correctly across success, success_via_backup, error_max_iterations, error_timeout, and tool-call paths.
1 parent 873e5e3 commit 11cd7c2

1 file changed

Lines changed: 97 additions & 43 deletions

File tree

src/llm/api.py

Lines changed: 97 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@
2121

2222
from src.config import ConfiguredModelSettings, ModelConfig
2323
from src.exceptions import ValidationException
24+
from src.telemetry.llm_call_metrics import (
25+
finalize_success,
26+
mark_max_iterations,
27+
observe_llm_call,
28+
)
2429
from src.telemetry.logging import conditional_observe
2530
from src.telemetry.reasoning_traces import log_reasoning_trace
2631

@@ -193,6 +198,11 @@ async def honcho_llm_call(
193198
# tenacity uses 1-indexed attempts.
194199
current_attempt.set(1)
195200

201+
# Captures the AttemptPlan that produced the most recent (and on success,
202+
# the winning) call so observability can label by the model that actually
203+
# answered — primary on early attempts, backup on the final retry.
204+
last_plan: dict[str, AttemptPlan | None] = {"value": None}
205+
196206
def _get_attempt_plan() -> AttemptPlan:
197207
plan = plan_attempt(
198208
runtime_model_config=runtime_model_config,
@@ -201,6 +211,7 @@ def _get_attempt_plan() -> AttemptPlan:
201211
call_thinking_budget_tokens=thinking_budget_tokens,
202212
call_reasoning_effort=reasoning_effort,
203213
)
214+
last_plan["value"] = plan
204215
update_current_langfuse_observation(
205216
plan.provider,
206217
plan.model,
@@ -304,11 +315,92 @@ def _trace_stop_seqs() -> list[str] | None:
304315
stop_seqs if stop_seqs is not None else runtime_model_config.stop_sequences
305316
)
306317

307-
# Tool-less path: call once and return.
308-
if not tools or not tool_executor:
309-
result: (
310-
HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk]
311-
) = await decorated()
318+
with observe_llm_call(
319+
track_name=track_name,
320+
trace_name=trace_name,
321+
runtime_model_config=runtime_model_config,
322+
) as obs_state:
323+
# Tool-less path: call once and return.
324+
if not tools or not tool_executor:
325+
result: (
326+
HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk]
327+
) = await decorated()
328+
response_for_metrics = (
329+
result if isinstance(result, HonchoLLMCallResponse) else None
330+
)
331+
winning = last_plan["value"]
332+
finalize_success(
333+
obs_state,
334+
response=response_for_metrics,
335+
final_provider=str(winning.provider) if winning else None,
336+
final_model=winning.model if winning else None,
337+
attempts=current_attempt.get(),
338+
iterations=None,
339+
has_backup=runtime_model_config.fallback is not None,
340+
)
341+
if trace_name and isinstance(result, HonchoLLMCallResponse):
342+
log_reasoning_trace(
343+
task_type=trace_name,
344+
model_config=runtime_model_config,
345+
prompt=prompt,
346+
response=result,
347+
max_tokens=max_tokens,
348+
thinking_budget_tokens=_trace_thinking_budget(),
349+
reasoning_effort=_trace_reasoning_effort(),
350+
json_mode=json_mode,
351+
stop_seqs=_trace_stop_seqs(),
352+
messages=messages,
353+
)
354+
return result
355+
356+
# execute_tool_loop raises ValidationException on out-of-range
357+
# max_tool_iterations; fail-fast is cheaper than silent clamping here.
358+
result = await execute_tool_loop(
359+
prompt=prompt,
360+
max_tokens=max_tokens,
361+
messages=messages,
362+
tools=tools,
363+
tool_choice=tool_choice,
364+
tool_executor=tool_executor,
365+
max_tool_iterations=max_tool_iterations,
366+
response_model=response_model,
367+
json_mode=json_mode,
368+
temperature=temperature,
369+
stop_seqs=stop_seqs,
370+
verbosity=verbosity,
371+
enable_retry=enable_retry,
372+
retry_attempts=retry_attempts,
373+
max_input_tokens=max_input_tokens,
374+
get_attempt_plan=_get_attempt_plan,
375+
before_retry_callback=before_retry_callback,
376+
stream_final=stream_final_only,
377+
iteration_callback=iteration_callback,
378+
track_name=track_name,
379+
trace_name=trace_name,
380+
)
381+
response_for_metrics = (
382+
result if isinstance(result, HonchoLLMCallResponse) else None
383+
)
384+
winning = last_plan["value"]
385+
iterations = (
386+
response_for_metrics.iterations
387+
if response_for_metrics
388+
else (getattr(result, "iterations", None))
389+
)
390+
finalize_success(
391+
obs_state,
392+
response=response_for_metrics,
393+
final_provider=str(winning.provider) if winning else None,
394+
final_model=winning.model if winning else None,
395+
attempts=current_attempt.get(),
396+
iterations=iterations,
397+
has_backup=runtime_model_config.fallback is not None,
398+
)
399+
if response_for_metrics is not None and getattr(
400+
response_for_metrics, "hit_max_iterations", False
401+
):
402+
mark_max_iterations(obs_state, iterations or max_tool_iterations)
403+
312404
if trace_name and isinstance(result, HonchoLLMCallResponse):
313405
log_reasoning_trace(
314406
task_type=trace_name,
@@ -324,43 +416,5 @@ def _trace_stop_seqs() -> list[str] | None:
324416
)
325417
return result
326418

327-
# execute_tool_loop raises ValidationException on out-of-range
328-
# max_tool_iterations; fail-fast is cheaper than silent clamping here.
329-
result = await execute_tool_loop(
330-
prompt=prompt,
331-
max_tokens=max_tokens,
332-
messages=messages,
333-
tools=tools,
334-
tool_choice=tool_choice,
335-
tool_executor=tool_executor,
336-
max_tool_iterations=max_tool_iterations,
337-
response_model=response_model,
338-
json_mode=json_mode,
339-
temperature=temperature,
340-
stop_seqs=stop_seqs,
341-
verbosity=verbosity,
342-
enable_retry=enable_retry,
343-
retry_attempts=retry_attempts,
344-
max_input_tokens=max_input_tokens,
345-
get_attempt_plan=_get_attempt_plan,
346-
before_retry_callback=before_retry_callback,
347-
stream_final=stream_final_only,
348-
iteration_callback=iteration_callback,
349-
)
350-
if trace_name and isinstance(result, HonchoLLMCallResponse):
351-
log_reasoning_trace(
352-
task_type=trace_name,
353-
model_config=runtime_model_config,
354-
prompt=prompt,
355-
response=result,
356-
max_tokens=max_tokens,
357-
thinking_budget_tokens=_trace_thinking_budget(),
358-
reasoning_effort=_trace_reasoning_effort(),
359-
json_mode=json_mode,
360-
stop_seqs=_trace_stop_seqs(),
361-
messages=messages,
362-
)
363-
return result
364-
365419

366420
__all__ = ["honcho_llm_call"]

0 commit comments

Comments
 (0)