Skip to content

Commit b6ab879

Browse files
authored
Merge pull request lightspeed-core#163 from asamal4/missing-metric-metadata
[LEADS-230] fix: missing metric_metadata value in csv
2 parents 4f7c900 + bc4ed3a commit b6ab879

5 files changed

Lines changed: 13 additions & 14 deletions

File tree

config/system.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -188,10 +188,10 @@ output:
188188
- "conversation_group_id"
189189
- "turn_id"
190190
- "metric_identifier"
191+
- "metric_metadata"
191192
- "result"
192193
- "score"
193194
- "threshold"
194-
- "metric_metadata"
195195
- "reason"
196196
- "execution_time"
197197
- "query"

src/lightspeed_evaluation/core/constants.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -82,10 +82,10 @@
8282
"tag",
8383
"turn_id",
8484
"metric_identifier",
85+
"metric_metadata",
8586
"result",
8687
"score",
8788
"threshold",
88-
"metric_metadata",
8989
"reason",
9090
"query",
9191
"response",

src/lightspeed_evaluation/core/models/data.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -470,6 +470,10 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin):
470470
min_length=1,
471471
description="Metric identifier (e.g., 'ragas:response_relevancy')",
472472
)
473+
metric_metadata: Optional[str] = Field(
474+
default=None,
475+
description="Metric metadata for evaluation (JSON, excludes identifier and threshold)",
476+
)
473477
query: str = Field(default="", description="Query text")
474478
response: str = Field(default="", description="Response text")
475479
execution_time: float = Field(
@@ -499,10 +503,6 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin):
499503
expected_tool_calls: Optional[str] = Field(
500504
default=None, description="Expected tool calls formatted as string"
501505
)
502-
metrics_metadata: Optional[str] = Field(
503-
default=None,
504-
description="Additional metric metadata (JSON-encoded key-value pairs)",
505-
)
506506

507507

508508
class EvaluationScope(BaseModel):

src/lightspeed_evaluation/pipeline/evaluation/evaluator.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -142,6 +142,7 @@ def evaluate_metric( # pylint: disable=too-many-locals
142142
tag=request.conv_data.tag,
143143
turn_id=request.turn_id,
144144
metric_identifier=request.metric_identifier,
145+
metric_metadata=self._extract_metadata_for_csv(request),
145146
query=turn_data.query if turn_data else "",
146147
response=turn_data.response or "" if turn_data else "",
147148
execution_time=execution_time,
@@ -169,7 +170,6 @@ def evaluate_metric( # pylint: disable=too-many-locals
169170
expected_tool_calls=(
170171
_to_json_str(turn_data.expected_tool_calls) if turn_data else None
171172
),
172-
metrics_metadata=self._extract_metadata_for_csv(request),
173173
)
174174

175175
except EvaluationError as e:
@@ -495,6 +495,7 @@ def _create_error_result(
495495
tag=request.conv_data.tag,
496496
turn_id=request.turn_id,
497497
metric_identifier=request.metric_identifier,
498+
metric_metadata=self._extract_metadata_for_csv(request),
498499
result="ERROR",
499500
score=None,
500501
threshold=None,
@@ -508,7 +509,6 @@ def _create_error_result(
508509
time_to_first_token=turn_data.time_to_first_token if turn_data else None,
509510
streaming_duration=turn_data.streaming_duration if turn_data else None,
510511
tokens_per_second=turn_data.tokens_per_second if turn_data else None,
511-
metrics_metadata=self._extract_metadata_for_csv(request),
512512
)
513513

514514
def _determine_status(self, score: float, threshold: Optional[float]) -> str:

tests/unit/core/output/test_generator.py

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -344,20 +344,20 @@ def test_generate_csv_with_specific_results(
344344
self, tmp_path: Path, mocker: MockerFixture
345345
) -> None:
346346
"""Test CSV report generation with specific results."""
347+
metric_metadata = '{"max_ngram": 4}'
347348
results = [
348349
EvaluationResult(
349350
conversation_group_id="test_conv",
350351
turn_id="turn1",
351-
metric_identifier="test:metric",
352+
metric_identifier="nlp:bleu",
353+
metric_metadata=metric_metadata,
352354
result="PASS",
353355
score=0.8,
354356
threshold=0.7,
355-
reason="Good performance",
357+
reason="Score is 0.8",
356358
query="What is OpenShift?",
357359
response="OpenShift is a container platform.",
358360
execution_time=1.5,
359-
contexts='["OpenShift context"]',
360-
expected_keywords='[["OpenShift", "container"]]',
361361
),
362362
EvaluationResult(
363363
conversation_group_id="test_conv",
@@ -404,8 +404,7 @@ def test_generate_csv_with_specific_results(
404404
assert rows[0]["result"] == "PASS"
405405
assert rows[0]["query"] == "What is OpenShift?"
406406
assert rows[0]["response"] == "OpenShift is a container platform."
407-
assert rows[0]["contexts"] == '["OpenShift context"]'
408-
assert rows[0]["expected_keywords"] == '[["OpenShift", "container"]]'
407+
assert rows[0]["metric_metadata"] == metric_metadata
409408

410409
assert rows[1]["result"] == "FAIL"
411410
assert rows[1]["expected_response"] == "Use oc apply -f deployment.yaml"

0 commit comments

Comments (0)