Skip to content

Commit eb6d4bf

Browse files
dmaniloff and claude authored
Upgrade to llama-stack 0.6.0 and ragas 0.4.x (#64)
* Upgrade to llama-stack 0.6.0 and ragas 0.4.x

  - Bump provider version to 0.7.0 targeting llama-stack >=0.6.0
  - Upgrade ragas from ==0.3.0 to >=0.4.0,<0.5.0
  - Add 6 new metrics: AnswerAccuracy, ContextRelevance, FactualCorrectness, NoiseSensitivity, ResponseGroundedness, context_entity_recall
  - Implement is_finished() on LLM wrappers (now required by BaseRagasLLM)
  - Fix test fixture metric name (semantic_similarity -> answer_similarity)
  - Update COMPATIBILITY.md with release/0.6.x branch and version entries

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Post-review fixes for ragas 0.4.x upgrade

  - Fix EvaluationResult import in kubeflow components (ragas.dataset_schema → ragas.evaluation)
  - Remove stale commented-out is_finished code from inline wrappers
  - Eliminate deprecation-triggering lazy imports in base._get_metrics by using METRIC_MAPPING

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Address Sourcery review: guard default metrics, fix is_finished fallback, add tests

  - Guard _DEFAULT_METRICS against METRIC_MAPPING drift with .get() + warning
  - Replace unconditional `return True` in is_finished with content-based check
  - Add unit tests for _get_metrics (6 tests) and is_finished (8 tests)
  - Add nv_accuracy (AnswerAccuracy) to benchmark scoring_functions and test_direct_evaluation

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Remove deprecation warning suppression for ragas.metrics imports

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 6c20416 commit eb6d4bf

14 files changed

Lines changed: 529 additions & 182 deletions

File tree

COMPATIBILITY.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,15 @@ branches are organized by llama-stack compatibility:
1515
|------------------|--------------------|-------------------|
1616
| `release/0.4.x` | 0.3.x | 0.4.3+ |
1717
| `release/0.5.x` | 0.4.x | 0.5.4+ |
18-
| `main` | 0.5.x+ | 0.6.0+ |
18+
| `release/0.6.x` | 0.5.x | 0.6.0+ |
19+
| `main` | 0.6.x+ | 0.7.0+ |
1920

2021
## Version Compatibility Table
2122

2223
| Provider Version | Llama-Stack Dependency | Python | Release Branch | Notes |
2324
|------------------|-------------------------------|---------|------------------|--------------------------------------|
24-
| 0.6.0 | >=0.5.0 | >=3.12 | `main` | Current latest release |
25+
| 0.7.0 | >=0.6.0 | >=3.12 | `main` | Current latest release |
26+
| 0.6.1 | >=0.5.0 | >=3.12 | `release/0.6.x` | Maintenance release for lls 0.5.x |
2527
| 0.5.4 | [client]>=0.4.2,<0.5.0 | >=3.12 | `release/0.5.x` | Maintenance release for lls 0.4.x |
2628
| 0.4.3 | [client]>=0.3.5,<0.4.0 | >=3.12 | `release/0.4.x` | Maintenance release for lls 0.3.x |
2729
| 0.5.1 | >0.2.23 (loose) | >=3.12 || Legacy; use 0.4.3 for lls 0.3.x |
@@ -42,6 +44,7 @@ branches are organized by llama-stack compatibility:
4244
If you need to target a specific llama-stack version, use the following
4345
provider versions:
4446

45-
- **llama-stack 0.5.x**: use provider `>=0.6.0` (`pip install llama-stack-provider-ragas>=0.6.0`)
47+
- **llama-stack 0.6.x**: use provider `>=0.7.0` (`pip install llama-stack-provider-ragas>=0.7.0`)
48+
- **llama-stack 0.5.x**: use provider `==0.6.1` (`pip install llama-stack-provider-ragas==0.6.1`)
4649
- **llama-stack 0.4.x**: use provider `==0.5.4` (`pip install llama-stack-provider-ragas==0.5.4`)
4750
- **llama-stack 0.3.x**: use provider `==0.4.3` (`pip install llama-stack-provider-ragas==0.4.3`)

pyproject.toml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "llama-stack-provider-ragas"
7-
version = "0.6.1"
7+
version = "0.7.0"
88
description = "Ragas evaluation as an out-of-tree Llama Stack provider"
99
readme = "README.md"
1010
requires-python = ">=3.12"
@@ -25,15 +25,15 @@ authors = [
2525
keywords = ["llama-stack", "ragas", "evaluation"]
2626
dependencies = [
2727
"setuptools-scm",
28-
"llama-stack>=0.5.0",
29-
"llama-stack-api>=0.5.0",
28+
"llama-stack>=0.6.0",
29+
"llama-stack-api>=0.6.0",
3030
"greenlet==3.2.4", # inline/files/localfs errors saying greenlet not found
31-
"ragas==0.3.0",
31+
"ragas>=0.4.0,<0.5.0",
3232
"pandas<2.4.0",
3333
"pyarrow>=21.0.0",
3434
"requests>=2.32.5",
3535
"datasets>=2.16.0",
36-
"llama-stack-client>=0.5.0",
36+
"llama-stack-client>=0.6.0",
3737
]
3838

3939
[project.urls]

src/llama_stack_provider_ragas/base.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,13 @@ class RagasEvaluatorBase(Eval, BenchmarksProtocolPrivate, ABC):
3939
def __init__(self):
4040
self.benchmarks: dict[str, Benchmark] = {}
4141

42+
_DEFAULT_METRICS = [
43+
"answer_relevancy",
44+
"context_precision",
45+
"faithfulness",
46+
"context_recall",
47+
]
48+
4249
def _get_metrics(self, scoring_functions: list[str]) -> list:
4350
"""Get the list of metrics to run based on scoring functions.
4451
@@ -48,13 +55,6 @@ def _get_metrics(self, scoring_functions: list[str]) -> list:
4855
Returns:
4956
List of metrics (unconfigured - ragas_evaluate will configure them)
5057
"""
51-
from ragas.metrics import (
52-
answer_relevancy,
53-
context_precision,
54-
context_recall,
55-
faithfulness,
56-
)
57-
5858
metrics = []
5959

6060
for metric_name in scoring_functions:
@@ -65,14 +65,19 @@ def _get_metrics(self, scoring_functions: list[str]) -> list:
6565
logger.warning(f"Unknown metric: {metric_name}")
6666

6767
if not metrics:
68-
# Use default metrics if none specified or all invalid
6968
logger.info("Using default metrics")
70-
metrics = [
71-
answer_relevancy,
72-
context_precision,
73-
faithfulness,
74-
context_recall,
75-
]
69+
for name in self._DEFAULT_METRICS:
70+
if name in METRIC_MAPPING:
71+
metrics.append(METRIC_MAPPING[name])
72+
else:
73+
logger.warning(
74+
f"Default metric not found in METRIC_MAPPING: {name}"
75+
)
76+
if not metrics:
77+
raise RagasEvaluationError(
78+
"No valid default metrics found. Check that _DEFAULT_METRICS "
79+
"keys match METRIC_MAPPING entries."
80+
)
7681

7782
return metrics
7883

src/llama_stack_provider_ragas/constants.py

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
from ragas.metrics import (
2+
AnswerAccuracy,
3+
ContextRelevance,
4+
FactualCorrectness,
5+
NoiseSensitivity,
6+
ResponseGroundedness,
27
answer_relevancy,
38
answer_similarity,
9+
context_entity_recall,
410
context_precision,
511
context_recall,
612
faithfulness,
@@ -10,18 +16,28 @@
1016
PROVIDER_ID_INLINE = "trustyai_ragas_inline"
1117
PROVIDER_ID_REMOTE = "trustyai_ragas_remote"
1218

13-
METRIC_MAPPING = {
14-
metric_func.name: metric_func
15-
for metric_func in [
16-
answer_relevancy,
17-
answer_similarity,
18-
context_precision,
19-
faithfulness,
20-
context_recall,
21-
# Can add other metrics here, e.g.:
22-
# "rouge_score": RougeScore(),
23-
]
24-
}
19+
# Pre-instantiated metric singletons (from ragas)
20+
_SINGLETON_METRICS = [
21+
answer_relevancy,
22+
answer_similarity,
23+
context_precision,
24+
faithfulness,
25+
context_recall,
26+
context_entity_recall,
27+
]
28+
29+
# Class-based metrics (new in ragas v0.4.x) that need instantiation.
30+
# Note: BleuScore, ChrfScore, and RougeScore are omitted because they
31+
# require optional dependencies (sacrebleu, rouge_score).
32+
_CLASS_METRICS = [
33+
AnswerAccuracy(),
34+
ContextRelevance(),
35+
FactualCorrectness(),
36+
NoiseSensitivity(),
37+
ResponseGroundedness(),
38+
]
39+
40+
METRIC_MAPPING = {m.name: m for m in _SINGLETON_METRICS + _CLASS_METRICS}
2541
AVAILABLE_METRICS = list(METRIC_MAPPING.keys())
2642

2743
# Kubeflow ConfigMap keys and defaults for base image resolution

src/llama_stack_provider_ragas/inline/provider.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ def get_provider_spec() -> ProviderSpec:
66
return InlineProviderSpec(
77
api=Api.eval,
88
provider_type=f"inline::{PROVIDER_TYPE}",
9-
pip_packages=["ragas==0.3.0"],
9+
pip_packages=["ragas>=0.4.0,<0.5.0"],
1010
config_class="llama_stack_provider_ragas.config.RagasProviderInlineConfig",
1111
module="llama_stack_provider_ragas.inline",
1212
api_dependencies=[

src/llama_stack_provider_ragas/inline/wrappers_inline.py

Lines changed: 12 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -158,94 +158,15 @@ async def agenerate_text(
158158
logger.error(f"LLM generation failed: {str(e)}")
159159
raise
160160

161-
# TODO: revisit this
162-
# def is_finished(self, response: LLMResult) -> bool:
163-
# """
164-
# Check if the LLM generation completed successfully.
165-
166-
# For Llama Stack responses, we check if the generation was completed
167-
# without hitting token limits or other issues.
168-
# """
169-
# try:
170-
# # First, check if we have Llama Stack specific information in llm_output
171-
# if response.llm_output and "llama_stack_responses" in response.llm_output:
172-
# llama_stack_responses = response.llm_output["llama_stack_responses"]
173-
174-
# for i, llama_response in enumerate(llama_stack_responses):
175-
# stop_reason = llama_response.get("stop_reason")
176-
# content_length = llama_response.get("content_length", 0)
177-
178-
# # Check stop_reason from Llama Stack response
179-
# if stop_reason == "out_of_tokens":
180-
# logger.warning(
181-
# f"Generation {i} hit token limit (stop_reason: {stop_reason})"
182-
# )
183-
# return False
184-
# elif stop_reason == "end_of_message":
185-
# # This is usually fine for tool calls, but might indicate incomplete generation
186-
# logger.info(
187-
# f"Generation {i} ended with end_of_message (stop_reason: {stop_reason})"
188-
# )
189-
# elif stop_reason == "end_of_turn":
190-
# # This is the ideal case - normal completion
191-
# logger.debug(
192-
# f"Generation {i} completed normally (stop_reason: {stop_reason})"
193-
# )
194-
# elif stop_reason is None:
195-
# logger.warning(f"Generation {i} has no stop_reason")
196-
# return False
197-
198-
# # Check content length
199-
# if content_length == 0:
200-
# logger.warning(f"Generation {i} has empty content")
201-
# return False
202-
# elif content_length < 10:
203-
# logger.warning(
204-
# f"Generation {i} has very short content ({content_length} chars)"
205-
# )
206-
# return False
207-
208-
# # If we have Llama Stack info and all checks pass, we're done
209-
# return True
210-
211-
# # Fallback to content-based validation if no Llama Stack info
212-
# for generation_list in response.generations:
213-
# for generation in generation_list:
214-
# # Check if the generated text is empty or None
215-
# if not generation.text or generation.text.strip() == "":
216-
# logger.warning("Empty response from Llama Stack LLM")
217-
# return False
218-
219-
# # Check if the response indicates an error or incomplete generation
220-
# if any(
221-
# error_indicator in generation.text.lower()
222-
# for error_indicator in [
223-
# "error",
224-
# "failed",
225-
# "timeout",
226-
# "incomplete",
227-
# "truncated",
228-
# ]
229-
# ):
230-
# logger.warning(
231-
# f"Response indicates error or incomplete generation: {generation.text[:100]}..."
232-
# )
233-
# return False
234-
235-
# # Check for common truncation indicators
236-
# if generation.text.endswith("...") or generation.text.endswith("…"):
237-
# logger.warning("Response appears to be truncated")
238-
# return False
239-
240-
# # Check if the response is too short (might indicate truncation)
241-
# if len(generation.text.strip()) < 10:
242-
# logger.warning("Response is very short, might be incomplete")
243-
# return False
244-
245-
# # If we get here, all generations look good
246-
# return True
247-
248-
# except Exception as e:
249-
# logger.error(f"Error checking if LLM generation is finished: {str(e)}")
250-
# # Default to True to avoid false positives, but log the error
251-
# return True
161+
def is_finished(self, response: LLMResult) -> bool:
162+
"""Check if the LLM generation completed successfully."""
163+
if response.llm_output and "llama_stack_responses" in response.llm_output:
164+
return all(
165+
r.get("stop_reason") not in (None, "out_of_tokens")
166+
for r in response.llm_output["llama_stack_responses"]
167+
)
168+
return bool(
169+
response.generations
170+
and response.generations[0]
171+
and any(g.text for g in response.generations[0])
172+
)

src/llama_stack_provider_ragas/remote/kubeflow/components.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def run_ragas_evaluation(
9595

9696
import pandas as pd
9797
from ragas import EvaluationDataset, evaluate
98-
from ragas.dataset_schema import EvaluationResult
98+
from ragas.evaluation import EvaluationResult
9999
from ragas.run_config import RunConfig
100100

101101
from llama_stack_provider_ragas.compat import SamplingParams

src/llama_stack_provider_ragas/remote/provider.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ def get_provider_spec() -> ProviderSpec:
99
adapter_type=PROVIDER_TYPE,
1010
module="llama_stack_provider_ragas.remote",
1111
pip_packages=[
12-
"ragas==0.3.0",
12+
"ragas>=0.4.0,<0.5.0",
1313
"kfp>=2.5.0",
1414
"kfp-kubernetes>=2.0.0",
1515
"s3fs>=2024.12.0",

src/llama_stack_provider_ragas/remote/wrappers_remote.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,19 @@ async def agenerate_text(
240240
logger.error(f"Async LLM generation failed: {str(e)}")
241241
raise
242242

243+
def is_finished(self, response: LLMResult) -> bool:
244+
"""Check if the LLM generation completed successfully."""
245+
if response.llm_output and "llama_stack_responses" in response.llm_output:
246+
return all(
247+
r.get("stop_reason") not in (None, "out_of_tokens")
248+
for r in response.llm_output["llama_stack_responses"]
249+
)
250+
return bool(
251+
response.generations
252+
and response.generations[0]
253+
and any(g.text for g in response.generations[0])
254+
)
255+
243256
def get_temperature(self, n: int) -> float:
244257
"""Get temperature based on number of completions."""
245258
return 0.3 if n > 1 else 1e-8

tests/conftest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,13 +103,13 @@ def register_benchmarks(
103103
client.alpha.benchmarks.register(
104104
benchmark_id=inline_benchmark_id,
105105
dataset_id=dataset_id,
106-
scoring_functions=["semantic_similarity"],
106+
scoring_functions=["answer_similarity", "nv_accuracy"],
107107
provider_id="trustyai_ragas_inline",
108108
)
109109
client.alpha.benchmarks.register(
110110
benchmark_id=remote_benchmark_id,
111111
dataset_id=dataset_id,
112-
scoring_functions=["semantic_similarity"],
112+
scoring_functions=["answer_similarity", "nv_accuracy"],
113113
provider_id="trustyai_ragas_remote",
114114
)
115115
yield

0 commit comments

Comments
 (0)