Skip to content

Commit e1a28b0

Browse files
feat: support BaseLlm instances as judge_model in evaluation
Allow JudgeModelOptions.judge_model to accept Union[str, BaseLlm] instead of only str. This enables custom/self-hosted models (e.g. LiteLlm with custom base_url) to be used as judge models for evaluation without requiring LLMRegistry registration. Follows the same pattern used by LlmAgent.model which already accepts Union[str, BaseLlm]. Fixes #3400
1 parent 9fec503 commit e1a28b0

File tree

5 files changed

+32
-16
lines changed

5 files changed

+32
-16
lines changed

src/google/adk/evaluation/eval_metrics.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
from pydantic.json_schema import SkipJsonSchema
2929
from typing_extensions import TypeAlias
3030

31+
from google.adk.models.base_llm import BaseLlm
32+
3133
from .common import EvalBaseModel
3234
from .eval_case import Invocation
3335
from .eval_rubrics import Rubric
@@ -75,10 +77,11 @@ class PrebuiltMetrics(Enum):
7577
class JudgeModelOptions(EvalBaseModel):
7678
"""Options for an eval metric's judge model."""
7779

78-
judge_model: str = Field(
80+
judge_model: Union[str, BaseLlm] = Field(
7981
default="gemini-2.5-flash",
8082
description=(
81-
"The judge model to use for evaluation. It can be a model name."
83+
"The judge model to use for evaluation. It can be a model name"
84+
" string or a BaseLlm instance for custom/self-hosted models."
8285
),
8386
)
8487

src/google/adk/evaluation/hallucinations_v1.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -294,17 +294,20 @@ def __init__(self, eval_metric: EvalMetric):
294294
self._judge_model = self._setup_auto_rater()
295295
self.segmenter_prompt = _HALLUCINATIONS_V1_SEGMENTER_PROMPT
296296
self.sentence_validator_prompt = _HALLUCINATIONS_V1_VALIDATOR_PROMPT
297-
self._model = self._judge_model_options.judge_model
297+
judge_model = self._judge_model_options.judge_model
298+
self._model = judge_model.model if isinstance(judge_model, BaseLlm) else judge_model
298299
self._model_config = (
299300
self._judge_model_options.judge_model_config
300301
or genai_types.GenerateContentConfig()
301302
)
302303

303304
def _setup_auto_rater(self) -> BaseLlm:
304-
model_id = self._judge_model_options.judge_model
305+
judge_model = self._judge_model_options.judge_model
306+
if isinstance(judge_model, BaseLlm):
307+
return judge_model
305308
llm_registry = LLMRegistry()
306-
llm_class = llm_registry.resolve(model_id)
307-
return llm_class(model=model_id)
309+
llm_class = llm_registry.resolve(judge_model)
310+
return llm_class(model=judge_model)
308311

309312
def _create_context_for_step(
310313
self,

src/google/adk/evaluation/llm_as_judge.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,10 @@ async def evaluate_invocations(
136136
per_invocation_results = []
137137
for actual, expected in zip(actual_invocations, expected_invocations):
138138
auto_rater_prompt = self.format_auto_rater_prompt(actual, expected)
139+
judge_model = self._judge_model_options.judge_model
140+
model_str = judge_model.model if isinstance(judge_model, BaseLlm) else judge_model
139141
llm_request = LlmRequest(
140-
model=self._judge_model_options.judge_model,
142+
model=model_str,
141143
contents=[
142144
genai_types.Content(
143145
parts=[genai_types.Part(text=auto_rater_prompt)],
@@ -181,7 +183,9 @@ async def evaluate_invocations(
181183
return EvaluationResult()
182184

183185
def _setup_auto_rater(self) -> BaseLlm:
184-
model_id = self._judge_model_options.judge_model
186+
judge_model = self._judge_model_options.judge_model
187+
if isinstance(judge_model, BaseLlm):
188+
return judge_model
185189
llm_registry = LLMRegistry()
186-
llm_class = llm_registry.resolve(model_id)
187-
return llm_class(model=model_id)
190+
llm_class = llm_registry.resolve(judge_model)
191+
return llm_class(model=judge_model)

src/google/adk/evaluation/simulation/llm_backed_user_simulator.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from typing_extensions import override
2525

2626
from ...events.event import Event
27+
from ...models.base_llm import BaseLlm
2728
from ...models.llm_request import LlmRequest
2829
from ...models.registry import LLMRegistry
2930
from ...utils.context_utils import Aclosing
@@ -124,9 +125,12 @@ def __init__(
124125
super().__init__(config, config_type=LlmBackedUserSimulator.config_type)
125126
self._conversation_scenario = conversation_scenario
126127
self._invocation_count = 0
127-
llm_registry = LLMRegistry()
128-
llm_class = llm_registry.resolve(self._config.model)
129-
self._llm = llm_class(model=self._config.model)
128+
if isinstance(self._config.model, BaseLlm):
129+
self._llm = self._config.model
130+
else:
131+
llm_registry = LLMRegistry()
132+
llm_class = llm_registry.resolve(self._config.model)
133+
self._llm = llm_class(model=self._config.model)
130134
self._user_persona = self._conversation_scenario.user_persona
131135

132136
@classmethod

src/google/adk/evaluation/simulation/per_turn_user_simulator_quality_v1.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -198,10 +198,12 @@ async def evaluate_invocations(
198198
return self._aggregate_conversation_results(results)
199199

200200
def _setup_llm(self) -> BaseLlm:
201-
model_id = self._llm_options.judge_model
201+
judge_model = self._llm_options.judge_model
202+
if isinstance(judge_model, BaseLlm):
203+
return judge_model
202204
llm_registry = LLMRegistry()
203-
llm_class = llm_registry.resolve(model_id)
204-
return llm_class(model=model_id)
205+
llm_class = llm_registry.resolve(judge_model)
206+
return llm_class(model=judge_model)
205207

206208
def _format_llm_prompt(
207209
self,

0 commit comments

Comments (0)