parallel-multi-agent-codegen/agents/llm_utils.py at main · tathadn/parallel-multi-agent-codegen · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""Shared LLM utilities — native Anthropic SDK with prompt caching, retries, and usage tracking."""

from __future__ import annotations

import json
import re
import time
from typing import Any, Optional

import anthropic
from langsmith import traceable
from tenacity import (
    RetryError,
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

from agents.pricing import compute_cost
from models.errors import (
    LLMBadRequest,
    LLMRateLimited,
    LLMTimeout,
    ParseFailure,
)
from models.state import LLMUsage

# Module-level SDK client, constructed once at import time and shared by every
# request issued from this file.
# NOTE(review): no arguments are passed, so credentials presumably come from
# the environment — confirm. The SDK client may also apply its own automatic
# retries beneath the tenacity policy below; verify that the combined attempt
# count is what is intended.
_client = anthropic.Anthropic()

# Retry policy: transient provider errors only. BadRequestError (400) is a
# programmer bug — never retry, it just burns money.
# This tuple feeds the ``retry_if_exception_type`` predicate on
# ``_create_message``; exception types not listed here propagate on the first
# failure without another attempt.
_RETRYABLE_EXC: tuple[type[BaseException], ...] = (
    anthropic.APIConnectionError,
    anthropic.APITimeoutError,
    anthropic.RateLimitError,
    anthropic.InternalServerError,
)


@retry(
    stop=stop_after_attempt(4),
    wait=wait_exponential(multiplier=1, min=2, max=30),
    retry=retry_if_exception_type(_RETRYABLE_EXC),
    reraise=True,
)
def _create_message(
    model_name: str,
    system: str,
    prompt: str,
) -> Any:
    """Issue a single Messages API request; the decorator supplies the retries.

    The system prompt is sent as one text block tagged with
    ``cache_control: ephemeral`` and the prompt as a single user message.
    Exceptions listed in ``_RETRYABLE_EXC`` trigger another attempt (at most
    4 attempts, exponential wait clamped to 2–30 s); because ``reraise=True``
    the last underlying exception is re-raised once attempts run out.

    Args:
        model_name: Model identifier forwarded to the SDK.
        system: System prompt text.
        prompt: User message text.

    Returns:
        Whatever ``_client.messages.create`` returns.
    """
    # Single system block carrying the cache marker.
    cached_system = {
        "type": "text",
        "text": system,
        "cache_control": {"type": "ephemeral"},
    }
    return _client.messages.create(
        model=model_name,
        max_tokens=8192,
        system=[cached_system],  # type: ignore[list-item]
        messages=[{"role": "user", "content": prompt}],
    )


@traceable(run_type="llm", name="call_llm")
def call_llm(
    system: str,
    prompt: str,
    model_name: str = "claude-sonnet-4-20250514",
    usage_sink: Optional[list[LLMUsage]] = None,
    agent_label: str = "unknown",
) -> str:
    """Send a system + human message and return the raw text response.

    The request goes through ``_create_message``, which marks the system
    prompt with ``cache_control=ephemeral`` to enable prompt caching and
    retries transient provider errors (up to 4 attempts, exponential backoff).
    Provider exceptions are translated into this project's error types so
    callers never have to import the SDK's exception hierarchy.

    Args:
        system: System prompt text.
        prompt: User message text.
        model_name: Model identifier forwarded to the provider.
        usage_sink: Optional list that is mutated in place: when the response
            carries a ``usage`` attribute, one ``LLMUsage`` record (token
            counts, latency, cost) is appended.
        agent_label: Label stored in the usage record's ``agent`` field.

    Returns:
        The ``text`` of the first content block of the response.

    Raises:
        LLMBadRequest: 400 from provider (programmer error, not retryable).
        LLMRateLimited: persistent 429/529 after retries exhausted.
        LLMTimeout: connection/timeout errors, or other 5xx server errors,
            after retries exhausted.
        ValueError: the response has no content blocks, or its first block
            has no ``text`` attribute.
    """
    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # wall-clock adjustments while the request is in flight.
    t_start = time.perf_counter()
    try:
        response = _create_message(model_name, system, prompt)
    except anthropic.BadRequestError as e:
        raise LLMBadRequest(f"{model_name}: {e}") from e
    except anthropic.RateLimitError as e:
        raise LLMRateLimited(f"{model_name}: {e}") from e
    except (anthropic.APIConnectionError, anthropic.APITimeoutError) as e:
        raise LLMTimeout(f"{model_name}: {e}") from e
    except anthropic.APIStatusError as e:
        # _create_message retries with reraise=True, so exhausted retries
        # surface the underlying status error here rather than a RetryError.
        # Translate server-side failures; anything else (auth, not-found, ...)
        # is re-raised untouched. This clause must stay after the more
        # specific status-error subclasses handled above.
        status = getattr(e, "status_code", None)
        if status == 529:
            raise LLMRateLimited(f"{model_name}: {e}") from e
        if isinstance(status, int) and status >= 500:
            raise LLMTimeout(f"{model_name}: retries exhausted ({e})") from e
        raise
    except RetryError as e:
        # Only reachable if the retry wrapper is ever configured without
        # reraise=True; kept as a safety net.
        raise LLMTimeout(f"{model_name}: retries exhausted ({e})") from e

    if usage_sink is not None:
        u = getattr(response, "usage", None)
        if u is not None:
            # `or 0` normalises fields the provider reports as None.
            entry = LLMUsage(
                agent=agent_label,
                model=model_name,
                input_tokens=getattr(u, "input_tokens", 0) or 0,
                cached_input_tokens=getattr(u, "cache_read_input_tokens", 0) or 0,
                cache_creation_tokens=getattr(u, "cache_creation_input_tokens", 0) or 0,
                output_tokens=getattr(u, "output_tokens", 0) or 0,
                latency_s=round(time.perf_counter() - t_start, 3),
            )
            entry.cost_usd = compute_cost(
                model_name,
                entry.input_tokens,
                entry.cached_input_tokens,
                entry.cache_creation_tokens,
                entry.output_tokens,
            )
            usage_sink.append(entry)

    blocks = response.content
    if not blocks:
        # An empty content list would otherwise surface as a bare IndexError.
        raise ValueError(f"{model_name}: response contained no content blocks")
    block = blocks[0]
    if hasattr(block, "text"):
        return block.text  # type: ignore[union-attr]
    raise ValueError(f"Unexpected response block type: {type(block)}")


def parse_json_response(text: str) -> Any:
    """Extract and parse JSON from an LLM response.

    Strategies are tried in order, returning the first that succeeds:

    1. Parse the whole (whitespace-stripped) response as JSON. This runs
       before any fence handling so that string values which themselves
       contain ``` (e.g. generated code) are never altered.
    2. Decode a JSON value starting at the first ``{`` or the first ``[``,
       whichever appears earlier, ignoring anything after the value. This
       covers markdown fences, leading prose and trailing text, and returns
       an enclosing object rather than an array nested inside it.
    3. Remove markdown fences and parse what remains. This keeps fenced
       scalars such as a bare number or string working.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON value.

    Raises:
        ParseFailure: if no valid JSON can be extracted.
    """
    raw = text.strip()

    # 1. The response is already pure JSON.
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass

    # 2. First object/array embedded in surrounding text. Only the first "{"
    #    and the first "[" are candidates, so a truncated outer document does
    #    not silently yield one of its inner fragments.
    decoder = json.JSONDecoder()
    starts = sorted(pos for pos in (raw.find("{"), raw.find("[")) if pos != -1)
    for start in starts:
        try:
            value, _consumed = decoder.raw_decode(raw[start:])
        except json.JSONDecodeError:
            continue
        return value

    # 3. Fallback: strip markdown code fences and retry a direct parse.
    unfenced = re.sub(r"```(?:json)?\s*", "", raw)
    unfenced = re.sub(r"```\s*$", "", unfenced).strip()
    try:
        return json.loads(unfenced)
    except json.JSONDecodeError:
        pass

    raise ParseFailure(f"Could not parse JSON from LLM response:\n{unfenced[:500]}")