"""Public response/stream/iteration types for the LLM API.
These used to live in src/utils/clients.py and have been moved here as part
of the migration toward src/llm/ owning all non-embedding LLM orchestration.
"""
from __future__ import annotations
from collections.abc import AsyncIterator, Callable
from dataclasses import dataclass
from typing import Any, Generic, Literal, TypeVar
from anthropic import AsyncAnthropic
from google import genai
from openai import AsyncOpenAI
from pydantic import BaseModel, Field
T = TypeVar("T")
# OpenAI GPT-5 specific reasoning levels.
ReasoningEffortType = (
Literal["none", "minimal", "low", "medium", "high", "xhigh", "max"] | None
)
VerbosityType = Literal["low", "medium", "high"] | None
# Raw SDK client union used by the provider-selection layer.
ProviderClient = AsyncAnthropic | AsyncOpenAI | genai.Client
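

# Hedged sketch (not part of this module's API) of one way a provider-selection
# layer might produce a ProviderClient. The function name and the provider
# strings are illustrative assumptions; each SDK constructor reads its API key
# from the environment by default.
def _example_make_client(provider: str) -> ProviderClient:
    if provider == "anthropic":
        return AsyncAnthropic()
    if provider == "openai":
        return AsyncOpenAI()
    # Fall back to the Google GenAI client for any other provider string.
    return genai.Client()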


@dataclass
class IterationData:
    """Data passed to iteration callbacks after each tool execution loop iteration."""

    iteration: int
    """1-indexed iteration number."""

    tool_calls: list[str]
    """List of tool names called in this iteration."""

    input_tokens: int
    """Input tokens used in this iteration's LLM call."""

    output_tokens: int
    """Output tokens generated in this iteration's LLM call."""

    cache_read_tokens: int = 0
    """Tokens read from cache in this iteration."""

    cache_creation_tokens: int = 0
    """Tokens written to cache in this iteration."""


IterationCallback = Callable[[IterationData], None]
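

# Hedged sketch of an IterationCallback implementation; `_example_log_iteration`
# is an illustrative name, not part of the public API. The tool-execution loop
# is expected to invoke a callable of this shape once per iteration, passing
# per-iteration token accounting.
def _example_log_iteration(data: IterationData) -> None:
    print(
        f"iteration {data.iteration}: tools={data.tool_calls}, "
        f"in={data.input_tokens}, out={data.output_tokens}, "
        f"cache_read={data.cache_read_tokens}, "
        f"cache_write={data.cache_creation_tokens}"
    )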


class HonchoLLMCallResponse(BaseModel, Generic[T]):
    """Response object for LLM calls.

    Note:
        Uncached input tokens = input_tokens - cache_read_input_tokens
        + cache_creation_input_tokens
        (cache_creation costs 25% more, cache_read costs 90% less)
    """

    content: T
    input_tokens: int = 0
    output_tokens: int
    cache_creation_input_tokens: int = 0
    cache_read_input_tokens: int = 0
    finish_reasons: list[str]
    tool_calls_made: list[dict[str, Any]] = Field(default_factory=list)

    iterations: int = 0
    """Number of LLM calls made in the tool execution loop."""

    hit_max_iterations: bool = False
    """True when the tool loop exited via the max-iterations synthesis path
    rather than the model deciding to stop. Telemetry-only signal."""

    thinking_content: str | None = None

    # Full thinking blocks with signatures for multi-turn replay (Anthropic only).
    thinking_blocks: list[dict[str, Any]] = Field(default_factory=list)

    # OpenRouter reasoning_details for Gemini models; must be preserved across turns.
    reasoning_details: list[dict[str, Any]] = Field(default_factory=list)
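

# Hedged sketch of the token arithmetic described in the class docstring above;
# `_example_uncached_input_tokens` is an illustrative helper, not part of the
# module's public API.
def _example_uncached_input_tokens(resp: HonchoLLMCallResponse[Any]) -> int:
    # Uncached input tokens = input_tokens - cache_read_input_tokens
    #                         + cache_creation_input_tokens
    return (
        resp.input_tokens
        - resp.cache_read_input_tokens
        + resp.cache_creation_input_tokens
    )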


class HonchoLLMCallStreamChunk(BaseModel):
    """A single chunk in a streaming LLM response."""

    content: str
    is_done: bool = False
    finish_reasons: list[str] = Field(default_factory=list)
    output_tokens: int | None = None


class StreamingResponseWithMetadata:
    """Streaming response wrapper carrying metadata from a completed tool loop.

    Lets callers read tool_calls_made / token counts / thinking_content from
    the tool-execution phase while still iterating the final streamed answer.
    """

    _stream: AsyncIterator[HonchoLLMCallStreamChunk]
    tool_calls_made: list[dict[str, Any]]
    input_tokens: int
    output_tokens: int
    cache_creation_input_tokens: int
    cache_read_input_tokens: int
    thinking_content: str | None
    iterations: int

    def __init__(
        self,
        stream: AsyncIterator[HonchoLLMCallStreamChunk],
        tool_calls_made: list[dict[str, Any]],
        input_tokens: int,
        output_tokens: int,
        cache_creation_input_tokens: int,
        cache_read_input_tokens: int,
        thinking_content: str | None = None,
        iterations: int = 0,
    ):
        self._stream = stream
        self.tool_calls_made = tool_calls_made
        self.input_tokens = input_tokens
        self.output_tokens = output_tokens
        self.cache_creation_input_tokens = cache_creation_input_tokens
        self.cache_read_input_tokens = cache_read_input_tokens
        self.thinking_content = thinking_content
        self.iterations = iterations

    def __aiter__(self) -> AsyncIterator[HonchoLLMCallStreamChunk]:
        return self._stream.__aiter__()

    async def __anext__(self) -> HonchoLLMCallStreamChunk:
        return await self._stream.__anext__()
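

# Hedged usage sketch: because StreamingResponseWithMetadata implements
# __aiter__/__anext__, callers can read tool-loop metadata up front and then
# stream the final answer with `async for`. `_example_consume` is an
# illustrative name, not part of the public API.
async def _example_consume(response: StreamingResponseWithMetadata) -> str:
    # Metadata from the completed tool-execution phase is available before
    # any streamed chunks are consumed.
    print(f"tool loop ran {response.iterations} iteration(s)")
    parts: list[str] = []
    async for chunk in response:
        parts.append(chunk.content)
    return "".join(parts)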


__all__ = [
    "HonchoLLMCallResponse",
    "HonchoLLMCallStreamChunk",
    "IterationCallback",
    "IterationData",
    "ProviderClient",
    "ReasoningEffortType",
    "StreamingResponseWithMetadata",
    "T",
    "VerbosityType",
]