Anxhela21
diff --git a/src/lightspeed_evaluation/core/constants.py b/src/lightspeed_evaluation/core/constants.py
Lines changed: 5 additions & 0 deletions
diff --git a/src/lightspeed_evaluation/core/llm/manager.py b/src/lightspeed_evaluation/core/llm/manager.py
Lines changed: 22 additions & 2 deletions
diff --git a/src/lightspeed_evaluation/core/models/__init__.py b/src/lightspeed_evaluation/core/models/__init__.py
Lines changed: 3 additions & 1 deletion
diff --git a/src/lightspeed_evaluation/core/models/data.py b/src/lightspeed_evaluation/core/models/data.py
Lines changed: 22 additions & 0 deletions
diff --git a/src/lightspeed_evaluation/core/models/system.py b/src/lightspeed_evaluation/core/models/system.py
Lines changed: 8 additions & 8 deletions
diff --git a/src/lightspeed_evaluation/core/output/generator.py b/src/lightspeed_evaluation/core/output/generator.py
Lines changed: 13 additions & 0 deletions
diff --git a/src/lightspeed_evaluation/core/system/loader.py b/src/lightspeed_evaluation/core/system/loader.py
Lines changed: 14 additions & 0 deletions
@@ -60,6 +60,9 @@
 
 DEFAULT_API_NUM_RETRIES = 3
 
+# Frameworks that don't require judge LLM (NLP, script-based evaluations)
+NON_LLM_FRAMEWORKS = frozenset({"nlp", "script"})
+
 DEFAULT_LLM_PROVIDER = "openai"
 DEFAULT_LLM_MODEL = "gpt-4o-mini"
 DEFAULT_SSL_VERIFY = True
@@ -96,6 +99,8 @@
     "api_output_tokens",
     "judge_llm_input_tokens",
     "judge_llm_output_tokens",
+    # Per-judge scores (JSON array with one entry for single judge)
+    "judge_scores",
     # Streaming performance metrics
     "time_to_first_token",
     "streaming_duration",
 
@@ -24,16 +24,20 @@ def __init__(
         self,
         config: LLMConfig,
         system_config: Optional[SystemConfig] = None,
+        judge_id: Optional[str] = None,
     ):
         """Initialize with validated environment and constructed model name.
 
         Args:
             config: Primary LLM configuration (also used as fallback)
             system_config: Optional full system config for judge panel support
+            judge_id: Optional identifier for this judge. Panel judges are created
+                with their pool key; when omitted or empty, defaults to "primary".
         """
         self.config = config
         self.system_config = system_config
         self.model_name = self._construct_model_name_and_validate(config)
+        self.judge_id = judge_id or "primary"
 
         # Initialize judge panel if available
         self.judge_managers: list["LLMManager"] = []
@@ -43,9 +47,9 @@ def __init__(
             # Create LLM managers for each judge using resolved configs from llms pool
             try:
                 judge_configs = system_config.get_judge_configs()
-                for resolved_config in judge_configs:
+                for pool_key, resolved_config in judge_configs:
                     # Create child manager without system_config to avoid recursion
-                    judge_manager = LLMManager(resolved_config)
+                    judge_manager = LLMManager(resolved_config, judge_id=pool_key)
                     self.judge_managers.append(judge_manager)
             except ValueError as e:
                 logger.error("Failed to resolve judge panel: %s", e)
@@ -163,6 +167,22 @@ def get_primary_judge(self) -> "LLMManager":
             return self.judge_managers[0]
         return self
 
+    def get_judges_for_metric(self, metric_identifier: str) -> list["LLMManager"]:
+        """Return the judges that should evaluate a given metric.
+
+        Uses get_judge_managers() when should_use_panel_for_metric() is true;
+        otherwise a one-element list with get_primary_judge(). Always a list.
+
+        Args:
+            metric_identifier: Metric identifier (e.g., "ragas:faithfulness")
+
+        Returns:
+            List of LLMManager instances to use for this metric
+        """
+        if self.should_use_panel_for_metric(metric_identifier):
+            return self.get_judge_managers()
+        return [self.get_primary_judge()]
+
     def should_use_panel_for_metric(self, metric_identifier: str) -> bool:
         """Determine if a metric should use judge panel based on enabled_metrics.
 
 
@@ -8,9 +8,10 @@
 from lightspeed_evaluation.core.models.data import (
     EvaluationData,
     EvaluationRequest,
-    MetricResult,
     EvaluationResult,
     EvaluationScope,
+    JudgeScore,
+    MetricResult,
     TurnData,
 )
 from lightspeed_evaluation.core.models.mixins import StreamingMetricsMixin
@@ -34,6 +35,7 @@
     "TurnData",
     "EvaluationData",
     "EvaluationRequest",
+    "JudgeScore",
     "MetricResult",
     "EvaluationResult",
     "EvaluationScope",
 
@@ -419,6 +419,24 @@ def validate_conversation_metrics(
         return v
 
 
+class JudgeScore(BaseModel):
+    """Score reported by one judge of a judge panel for a single metric.
+
+    Stores the judge's identifier, its optional score (validated to lie in
+    [0, 1]), its explanation, and the input/output token counts it used.
+    """
+
+    judge_id: str = Field(
+        ..., min_length=1, description="Judge identifier (model ID from llm_pool)"
+    )
+    score: Optional[float] = Field(
+        default=None, ge=0.0, le=1.0, description="Score between 0 and 1"
+    )
+    reason: str = Field(default="", description="Explanation from this judge")
+    input_tokens: int = Field(default=0, ge=0, description="Input tokens used")
+    output_tokens: int = Field(default=0, ge=0, description="Output tokens used")
+
+
 class MetricResult(BaseModel):
     """Model for framework metric result."""
 
@@ -439,6 +457,10 @@ class MetricResult(BaseModel):
     judge_llm_output_tokens: int = Field(
         default=0, ge=0, description="Judge LLM output tokens used"
     )
+    judge_scores: Optional[list[JudgeScore]] = Field(
+        default=None,
+        description="Per-judge scores when using judge panel (for transparency)",
+    )
 
     @field_validator("result")
     @classmethod
 
@@ -581,11 +581,11 @@ class JudgePanelConfig(BaseModel):
         ),
     )
     aggregation_strategy: str = Field(
-        default="average",
+        default="max",
         description=(
             "Strategy for aggregating scores from multiple judges. "
             "Options: 'max', 'average', 'majority_vote'. "
-            "Note: Currently unused - will be implemented later."
+            "Note: Currently only 'max' is implemented; others coming soon."
         ),
     )
 
@@ -817,16 +817,16 @@ def validate_default_metrics_metadata_geval(
                     ) from e
         return v
 
-    def get_judge_configs(self) -> list[LLMConfig]:
-        """Get resolved LLMConfig for all judges.
+    def get_judge_configs(self) -> list[tuple[str, LLMConfig]]:
+        """Get resolved LLMConfig for all judges with their pool keys.
 
         Returns:
-            List of LLMConfig objects for each judge.
+            List of (pool_key, LLMConfig) tuples for each judge.
             If judge_panel is configured, resolves from llm_pool.
-            Otherwise, returns single llm config.
+            Otherwise, returns single entry with "primary" as key.
         """
         if not self.judge_panel:
-            return [self.llm]
+            return [("primary", self.llm)]
 
         if not self.llm_pool:
             raise ConfigurationError(
@@ -840,7 +840,7 @@ def get_judge_configs(self) -> list[LLMConfig]:
             config = self.llm_pool.resolve_llm_config(
                 judge_id, cache_suffix=cache_suffix
             )
-            configs.append(config)
+            configs.append((judge_id, config))
         return configs
 
     def get_llm_config(
 
@@ -196,6 +196,13 @@ def _generate_csv_report(
                         # Special formatting for execution_time
                         if column == "execution_time" and value is not None:
                             row_data.append(f"{value:.3f}")
+                        # Convert judge_scores to JSON string
+                        elif column == "judge_scores" and value is not None:
+                            row_data.append(
+                                json.dumps(
+                                    [js.model_dump() for js in value], default=str
+                                )
+                            )
                         else:
                             row_data.append(value)
                     else:
@@ -254,6 +261,12 @@ def _generate_json_summary(  # pylint: disable=too-many-arguments,too-many-posit
                     "execution_time": round(r.execution_time, 3),
                     "judge_llm_input_tokens": r.judge_llm_input_tokens,
                     "judge_llm_output_tokens": r.judge_llm_output_tokens,
+                    # Judge panel scores (when using multiple judges)
+                    "judge_scores": (
+                        [js.model_dump() for js in r.judge_scores]
+                        if r.judge_scores
+                        else None
+                    ),
                     # Streaming performance metrics
                     "time_to_first_token": r.time_to_first_token,
                     "streaming_duration": r.streaming_duration,
 
@@ -16,6 +16,10 @@
     SystemConfig,
     VisualizationConfig,
 )
+from lightspeed_evaluation.core.models.system import (
+    JudgePanelConfig,
+    LLMPoolConfig,
+)
 from lightspeed_evaluation.core.system.setup import (
     setup_environment_variables,
     setup_logging,
@@ -156,6 +160,14 @@ def load_system_config(self, config_path: str) -> SystemConfig:
     def _create_system_config(self, config_data: dict[str, Any]) -> SystemConfig:
         """Create SystemConfig object from validated configuration data."""
         metrics_metadata = config_data.get("metrics_metadata", {})
+
+        # Parse llm_pool and judge_panel if present (Optional sections)
+        llm_pool_data = config_data.get("llm_pool")
+        llm_pool = LLMPoolConfig(**llm_pool_data) if llm_pool_data else None
+
+        judge_panel_data = config_data.get("judge_panel")
+        judge_panel = JudgePanelConfig(**judge_panel_data) if judge_panel_data else None
+
         return SystemConfig(
             core=CoreConfig(**config_data.get("core", {})),
             llm=LLMConfig(**config_data.get("llm", {})),
@@ -164,6 +176,8 @@ def _create_system_config(self, config_data: dict[str, Any]) -> SystemConfig:
             output=OutputConfig(**config_data.get("output", {})),
             logging=LoggingConfig(**config_data.get("logging", {})),
             visualization=VisualizationConfig(**config_data.get("visualization", {})),
+            llm_pool=llm_pool,
+            judge_panel=judge_panel,
             default_turn_metrics_metadata=metrics_metadata.get("turn_level", {}),
             default_conversation_metrics_metadata=metrics_metadata.get(
                 "conversation_level", {}