Skip to content

Commit 17da61a

Browse files
authored
Merge pull request lightspeed-core#184 from asamal4/integrate-judge-panel
feat: integrate panel of judges
2 parents 7f260c4 + 8d603e3 commit 17da61a

15 files changed

Lines changed: 1054 additions & 61 deletions

File tree

src/lightspeed_evaluation/core/constants.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,9 @@
6060

6161
DEFAULT_API_NUM_RETRIES = 3
6262

63+
# Frameworks that don't require judge LLM (NLP, script-based evaluations)
64+
NON_LLM_FRAMEWORKS = frozenset({"nlp", "script"})
65+
6366
DEFAULT_LLM_PROVIDER = "openai"
6467
DEFAULT_LLM_MODEL = "gpt-4o-mini"
6568
DEFAULT_SSL_VERIFY = True
@@ -96,6 +99,8 @@
9699
"api_output_tokens",
97100
"judge_llm_input_tokens",
98101
"judge_llm_output_tokens",
102+
# Per-judge scores (JSON array with one entry for single judge)
103+
"judge_scores",
99104
# Streaming performance metrics
100105
"time_to_first_token",
101106
"streaming_duration",

src/lightspeed_evaluation/core/llm/manager.py

Lines changed: 22 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,16 +24,20 @@ def __init__(
2424
self,
2525
config: LLMConfig,
2626
system_config: Optional[SystemConfig] = None,
27+
judge_id: Optional[str] = None,
2728
):
2829
"""Initialize with validated environment and constructed model name.
2930
3031
Args:
3132
config: Primary LLM configuration (also used as fallback)
3233
system_config: Optional full system config for judge panel support
34+
judge_id: Optional identifier for this judge (pool key). If not provided,
35+
defaults to "primary" for single LLM or the pool key for panel judges.
3336
"""
3437
self.config = config
3538
self.system_config = system_config
3639
self.model_name = self._construct_model_name_and_validate(config)
40+
self.judge_id = judge_id or "primary"
3741

3842
# Initialize judge panel if available
3943
self.judge_managers: list["LLMManager"] = []
@@ -43,9 +47,9 @@ def __init__(
4347
# Create LLM managers for each judge using resolved configs from llms pool
4448
try:
4549
judge_configs = system_config.get_judge_configs()
46-
for resolved_config in judge_configs:
50+
for pool_key, resolved_config in judge_configs:
4751
# Create child manager without system_config to avoid recursion
48-
judge_manager = LLMManager(resolved_config)
52+
judge_manager = LLMManager(resolved_config, judge_id=pool_key)
4953
self.judge_managers.append(judge_manager)
5054
except ValueError as e:
5155
logger.error("Failed to resolve judge panel: %s", e)
@@ -163,6 +167,22 @@ def get_primary_judge(self) -> "LLMManager":
163167
return self.judge_managers[0]
164168
return self
165169

170+
def get_judges_for_metric(self, metric_identifier: str) -> list["LLMManager"]:
    """Select the judge managers that should evaluate a given metric.

    The panel decision is delegated to ``should_use_panel_for_metric``. When
    it reports that the metric uses the panel, the result of
    ``get_judge_managers()`` is returned; otherwise the primary judge is
    wrapped in a one-element list. Callers therefore always receive a list.

    Args:
        metric_identifier: Metric identifier (e.g., "ragas:faithfulness")

    Returns:
        List of LLMManager instances to use for this metric
    """
    use_panel = self.should_use_panel_for_metric(metric_identifier)
    if not use_panel:
        # Non-panel metrics are scored by the primary judge only.
        return [self.get_primary_judge()]
    return self.get_judge_managers()
185+
166186
def should_use_panel_for_metric(self, metric_identifier: str) -> bool:
167187
"""Determine if a metric should use judge panel based on enabled_metrics.
168188

src/lightspeed_evaluation/core/models/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,9 +8,10 @@
88
from lightspeed_evaluation.core.models.data import (
99
EvaluationData,
1010
EvaluationRequest,
11-
MetricResult,
1211
EvaluationResult,
1312
EvaluationScope,
13+
JudgeScore,
14+
MetricResult,
1415
TurnData,
1516
)
1617
from lightspeed_evaluation.core.models.mixins import StreamingMetricsMixin
@@ -34,6 +35,7 @@
3435
"TurnData",
3536
"EvaluationData",
3637
"EvaluationRequest",
38+
"JudgeScore",
3739
"MetricResult",
3840
"EvaluationResult",
3941
"EvaluationScope",

src/lightspeed_evaluation/core/models/data.py

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -419,6 +419,24 @@ def validate_conversation_metrics(
419419
return v
420420

421421

422+
class JudgeScore(BaseModel):
    """Score reported by one judge of a judge panel for a single metric.

    One instance is kept per judge when several judges evaluate the same
    metric, so per-judge details remain available for transparency and
    analysis.
    """

    # Required, non-empty identifier of the judge.
    judge_id: str = Field(
        ...,
        min_length=1,
        description="Judge identifier (model ID from llm_pool)",
    )
    # Optional score; when present it is constrained to [0.0, 1.0].
    score: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Score between 0 and 1",
    )
    # Free-text explanation; defaults to an empty string.
    reason: str = Field(
        default="",
        description="Explanation from this judge",
    )
    # Token counters; both default to 0 and may not be negative.
    input_tokens: int = Field(
        default=0,
        ge=0,
        description="Input tokens used",
    )
    output_tokens: int = Field(
        default=0,
        ge=0,
        description="Output tokens used",
    )
438+
439+
422440
class MetricResult(BaseModel):
423441
"""Model for framework metric result."""
424442

@@ -439,6 +457,10 @@ class MetricResult(BaseModel):
439457
judge_llm_output_tokens: int = Field(
440458
default=0, ge=0, description="Judge LLM output tokens used"
441459
)
460+
judge_scores: Optional[list[JudgeScore]] = Field(
461+
default=None,
462+
description="Per-judge scores when using judge panel (for transparency)",
463+
)
442464

443465
@field_validator("result")
444466
@classmethod

src/lightspeed_evaluation/core/models/system.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -581,11 +581,11 @@ class JudgePanelConfig(BaseModel):
581581
),
582582
)
583583
aggregation_strategy: str = Field(
584-
default="average",
584+
default="max",
585585
description=(
586586
"Strategy for aggregating scores from multiple judges. "
587587
"Options: 'max', 'average', 'majority_vote'. "
588-
"Note: Currently unused - will be implemented later."
588+
"Note: Currently only 'max' is implemented; others coming soon."
589589
),
590590
)
591591

@@ -817,16 +817,16 @@ def validate_default_metrics_metadata_geval(
817817
) from e
818818
return v
819819

820-
def get_judge_configs(self) -> list[LLMConfig]:
821-
"""Get resolved LLMConfig for all judges.
820+
def get_judge_configs(self) -> list[tuple[str, LLMConfig]]:
821+
"""Get resolved LLMConfig for all judges with their pool keys.
822822
823823
Returns:
824-
List of LLMConfig objects for each judge.
824+
List of (pool_key, LLMConfig) tuples for each judge.
825825
If judge_panel is configured, resolves from llm_pool.
826-
Otherwise, returns single llm config.
826+
Otherwise, returns single entry with "primary" as key.
827827
"""
828828
if not self.judge_panel:
829-
return [self.llm]
829+
return [("primary", self.llm)]
830830

831831
if not self.llm_pool:
832832
raise ConfigurationError(
@@ -840,7 +840,7 @@ def get_judge_configs(self) -> list[LLMConfig]:
840840
config = self.llm_pool.resolve_llm_config(
841841
judge_id, cache_suffix=cache_suffix
842842
)
843-
configs.append(config)
843+
configs.append((judge_id, config))
844844
return configs
845845

846846
def get_llm_config(

src/lightspeed_evaluation/core/output/generator.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -196,6 +196,13 @@ def _generate_csv_report(
196196
# Special formatting for execution_time
197197
if column == "execution_time" and value is not None:
198198
row_data.append(f"{value:.3f}")
199+
# Convert judge_scores to JSON string
200+
elif column == "judge_scores" and value is not None:
201+
row_data.append(
202+
json.dumps(
203+
[js.model_dump() for js in value], default=str
204+
)
205+
)
199206
else:
200207
row_data.append(value)
201208
else:
@@ -254,6 +261,12 @@ def _generate_json_summary( # pylint: disable=too-many-arguments,too-many-posit
254261
"execution_time": round(r.execution_time, 3),
255262
"judge_llm_input_tokens": r.judge_llm_input_tokens,
256263
"judge_llm_output_tokens": r.judge_llm_output_tokens,
264+
# Judge panel scores (when using multiple judges)
265+
"judge_scores": (
266+
[js.model_dump() for js in r.judge_scores]
267+
if r.judge_scores
268+
else None
269+
),
257270
# Streaming performance metrics
258271
"time_to_first_token": r.time_to_first_token,
259272
"streaming_duration": r.streaming_duration,

src/lightspeed_evaluation/core/system/loader.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,10 @@
1616
SystemConfig,
1717
VisualizationConfig,
1818
)
19+
from lightspeed_evaluation.core.models.system import (
20+
JudgePanelConfig,
21+
LLMPoolConfig,
22+
)
1923
from lightspeed_evaluation.core.system.setup import (
2024
setup_environment_variables,
2125
setup_logging,
@@ -156,6 +160,14 @@ def load_system_config(self, config_path: str) -> SystemConfig:
156160
def _create_system_config(self, config_data: dict[str, Any]) -> SystemConfig:
157161
"""Create SystemConfig object from validated configuration data."""
158162
metrics_metadata = config_data.get("metrics_metadata", {})
163+
164+
# Parse llm_pool and judge_panel if present (Optional sections)
165+
llm_pool_data = config_data.get("llm_pool")
166+
llm_pool = LLMPoolConfig(**llm_pool_data) if llm_pool_data else None
167+
168+
judge_panel_data = config_data.get("judge_panel")
169+
judge_panel = JudgePanelConfig(**judge_panel_data) if judge_panel_data else None
170+
159171
return SystemConfig(
160172
core=CoreConfig(**config_data.get("core", {})),
161173
llm=LLMConfig(**config_data.get("llm", {})),
@@ -164,6 +176,8 @@ def _create_system_config(self, config_data: dict[str, Any]) -> SystemConfig:
164176
output=OutputConfig(**config_data.get("output", {})),
165177
logging=LoggingConfig(**config_data.get("logging", {})),
166178
visualization=VisualizationConfig(**config_data.get("visualization", {})),
179+
llm_pool=llm_pool,
180+
judge_panel=judge_panel,
167181
default_turn_metrics_metadata=metrics_metadata.get("turn_level", {}),
168182
default_conversation_metrics_metadata=metrics_metadata.get(
169183
"conversation_level", {}

0 commit comments

Comments
 (0)