feat: add LiteLLM as AI gateway provider (#10)

RheagalFire · web-flow · commit 4ee769baa33d · 2026-05-04T22:46:45.000+08:00
diff --git a/corecoder/cli.py b/corecoder/cli.py
@@ -12,7 +12,7 @@
 from prompt_toolkit.key_binding import KeyBindings
 
 from .agent import Agent
-from .llm import LLM
+from .llm import LLM, LiteLLM
 from .config import Config
 from .session import save_session, load_session, list_sessions
 from . import __version__
@@ -62,7 +62,8 @@ def main():
         )
         sys.exit(1)
 
-    llm = LLM(
+    llm_cls = LiteLLM if config.provider == "litellm" else LLM
+    llm = llm_cls(
         model=config.model,
         api_key=config.api_key,
         base_url=config.base_url,
diff --git a/corecoder/config.py b/corecoder/config.py
@@ -33,6 +33,7 @@ class Config:
     max_tokens: int = 4096
     temperature: float = 0.0
     max_context_tokens: int = 128_000
+    provider: str = "openai"
 
     @classmethod
     def from_env(cls) -> "Config":
@@ -52,4 +53,5 @@ def from_env(cls) -> "Config":
             max_tokens=int(os.getenv("CORECODER_MAX_TOKENS", "4096")),
             temperature=float(os.getenv("CORECODER_TEMPERATURE", "0")),
             max_context_tokens=int(os.getenv("CORECODER_MAX_CONTEXT", "128000")),
+            provider=os.getenv("CORECODER_PROVIDER", "openai"),
         )
diff --git a/corecoder/llm.py b/corecoder/llm.py
@@ -3,6 +3,10 @@
 Since most providers (DeepSeek, Qwen, Kimi, GLM, Ollama, etc.) expose an
 OpenAI-compatible endpoint, we just use the openai SDK directly.  Switch
 provider by changing OPENAI_BASE_URL + OPENAI_API_KEY. That's it.
+
+For providers that are NOT OpenAI-compatible (AWS Bedrock, Google Vertex,
+etc.), use the LiteLLM backend which routes to 100+ providers through a
+single unified interface. Set CORECODER_PROVIDER=litellm.
 """
 
 import json
@@ -197,3 +201,127 @@ def _call_with_retry(self, params: dict, max_retries: int = 3):
                     time.sleep(2 ** attempt)
                 else:
                     raise
+
+
+class LiteLLM(LLM):
+    """LLM backend via LiteLLM, supporting 100+ providers.
+
+    Use this when your target provider is NOT OpenAI-compatible
+    (AWS Bedrock, Google Vertex, Cohere, etc.) or when you want
+    a single interface to switch between any provider by changing
+    the model string.
+
+    Set CORECODER_PROVIDER=litellm and use LiteLLM model strings
+    like ``anthropic/claude-3-haiku``, ``bedrock/anthropic.claude-v2``,
+    ``vertex_ai/gemini-pro``, etc.
+    """
+
+    def __init__(
+        self,
+        model: str,
+        api_key: str | None = None,
+        base_url: str | None = None,
+        **kwargs,
+    ):
+        # skip LLM.__init__ which creates an OpenAI client
+        self.model = model
+        self.api_key = api_key
+        self.base_url = base_url
+        self.extra = kwargs
+        self.total_prompt_tokens = 0
+        self.total_completion_tokens = 0
+
+    def chat(
+        self,
+        messages: list[dict],
+        tools: list[dict] | None = None,
+        on_token=None,
+    ) -> LLMResponse:
+        """Send messages via litellm, stream back response, handle tool calls."""
+        params: dict = {
+            "model": self.model,
+            "messages": messages,
+            "stream": True,
+            **self.extra,
+        }
+        if tools:
+            params["tools"] = tools
+
+        stream = self._call_with_retry(params)
+
+        content_parts: list[str] = []
+        tc_map: dict[int, dict] = {}
+        prompt_tok = 0
+        completion_tok = 0
+
+        for chunk in stream:
+            usage = getattr(chunk, "usage", None)
+            if usage:
+                prompt_tok = getattr(usage, "prompt_tokens", 0) or 0
+                completion_tok = getattr(usage, "completion_tokens", 0) or 0
+
+            if not getattr(chunk, "choices", None):
+                continue
+            delta = chunk.choices[0].delta
+
+            if getattr(delta, "content", None):
+                content_parts.append(delta.content)
+                if on_token:
+                    on_token(delta.content)
+
+            if getattr(delta, "tool_calls", None):
+                for tc_delta in delta.tool_calls:
+                    idx = tc_delta.index
+                    if idx not in tc_map:
+                        tc_map[idx] = {"id": "", "name": "", "args": ""}
+                    if tc_delta.id:
+                        tc_map[idx]["id"] = tc_delta.id
+                    if tc_delta.function:
+                        if tc_delta.function.name:
+                            tc_map[idx]["name"] = tc_delta.function.name
+                        if tc_delta.function.arguments:
+                            tc_map[idx]["args"] += tc_delta.function.arguments
+
+        parsed: list[ToolCall] = []
+        for idx in sorted(tc_map):
+            raw = tc_map[idx]
+            try:
+                args = json.loads(raw["args"])
+            except (json.JSONDecodeError, KeyError):
+                args = {}
+            parsed.append(ToolCall(id=raw["id"], name=raw["name"], arguments=args))
+
+        self.total_prompt_tokens += prompt_tok
+        self.total_completion_tokens += completion_tok
+
+        return LLMResponse(
+            content="".join(content_parts),
+            tool_calls=parsed,
+            prompt_tokens=prompt_tok,
+            completion_tokens=completion_tok,
+        )
+
+    def _call_with_retry(self, params: dict, max_retries: int = 3):
+        """Retry on transient errors with exponential backoff via litellm."""
+        import litellm
+
+        params["drop_params"] = True
+        if self.api_key:
+            params["api_key"] = self.api_key
+        if self.base_url:
+            params["api_base"] = self.base_url
+
+        for attempt in range(max_retries):
+            try:
+                return litellm.completion(**params)
+            except Exception as e:
+                err = str(e).lower()
+                is_transient = any(
+                    kw in err
+                    for kw in ["rate_limit", "timeout", "connection", "502", "503", "529"]
+                )
+                is_server = any(kw in err for kw in ["500", "502", "503", "504"])
+                if (is_transient or is_server) and attempt < max_retries - 1:
+                    time.sleep(2 ** attempt)
+                else:
+                    raise
diff --git a/pyproject.toml b/pyproject.toml
@@ -39,6 +39,7 @@ dependencies = [
 packages = ["corecoder"]
 
 [project.optional-dependencies]
+litellm = ["litellm>=1.60.0,<2.0.0"]
 dev = ["pytest>=7.0"]
 
 [project.scripts]
diff --git a/tests/test_litellm.py b/tests/test_litellm.py