Upgrade to llama-stack 0.6.0 and ragas 0.4.x (#64)

dmaniloff · claude · web-flow · commit eb6d4bf2c765 · 2026-05-11T10:58:51.000-04:00
* Upgrade to llama-stack 0.6.0 and ragas 0.4.x

- Bump provider version to 0.7.0 targeting llama-stack >=0.6.0
- Upgrade ragas from ==0.3.0 to >=0.4.0,<0.5.0
- Add 6 new metrics: AnswerAccuracy, ContextRelevance, FactualCorrectness,
  NoiseSensitivity, ResponseGroundedness, context_entity_recall
- Implement is_finished() on LLM wrappers (now required by BaseRagasLLM)
- Fix test fixture metric name (semantic_similarity -> answer_similarity)
- Update COMPATIBILITY.md with release/0.6.x branch and version entries

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Post-review fixes for ragas 0.4.x upgrade

- Fix EvaluationResult import in kubeflow components (ragas.dataset_schema → ragas.evaluation)
- Remove stale commented-out is_finished code from inline wrappers
- Eliminate deprecation-triggering lazy imports in base._get_metrics by using METRIC_MAPPING

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Address Sourcery review: guard default metrics, fix is_finished fallback, add tests

- Guard _DEFAULT_METRICS against METRIC_MAPPING drift with .get() + warning
- Replace unconditional `return True` in is_finished with content-based check
- Add unit tests for _get_metrics (6 tests) and is_finished (8 tests)
- Add nv_accuracy (AnswerAccuracy) to benchmark scoring_functions and test_direct_evaluation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Remove deprecation warning suppression for ragas.metrics imports

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/COMPATIBILITY.md b/COMPATIBILITY.md
@@ -15,13 +15,15 @@ branches are organized by llama-stack compatibility:
 |------------------|--------------------|-------------------|
 | `release/0.4.x`  | 0.3.x              | 0.4.3+            |
 | `release/0.5.x`  | 0.4.x              | 0.5.4+            |
-| `main`           | 0.5.x+             | 0.6.0+            |
+| `release/0.6.x`  | 0.5.x              | 0.6.0+            |
+| `main`           | 0.6.x+             | 0.7.0+            |
 
 ## Version Compatibility Table
 
 | Provider Version | Llama-Stack Dependency        | Python  | Release Branch   | Notes                                |
 |------------------|-------------------------------|---------|------------------|--------------------------------------|
-| 0.6.0            | >=0.5.0                       | >=3.12  | `main`           | Current latest release               |
+| 0.7.0            | >=0.6.0                       | >=3.12  | `main`           | Current latest release               |
+| 0.6.1            | >=0.5.0                       | >=3.12  | `release/0.6.x`  | Maintenance release for lls 0.5.x    |
 | 0.5.4            | [client]>=0.4.2,<0.5.0        | >=3.12  | `release/0.5.x`  | Maintenance release for lls 0.4.x   |
 | 0.4.3            | [client]>=0.3.5,<0.4.0        | >=3.12  | `release/0.4.x`  | Maintenance release for lls 0.3.x   |
 | 0.5.1            | >0.2.23 (loose)               | >=3.12  | —                | Legacy; use 0.4.3 for lls 0.3.x     |
@@ -42,6 +44,7 @@ branches are organized by llama-stack compatibility:
 If you need to target a specific llama-stack version, use the following
 provider versions:
 
-- **llama-stack 0.5.x**: use provider `>=0.6.0` (`pip install llama-stack-provider-ragas>=0.6.0`)
+- **llama-stack 0.6.x**: use provider `>=0.7.0` (`pip install llama-stack-provider-ragas>=0.7.0`)
+- **llama-stack 0.5.x**: use provider `==0.6.1` (`pip install llama-stack-provider-ragas==0.6.1`)
 - **llama-stack 0.4.x**: use provider `==0.5.4` (`pip install llama-stack-provider-ragas==0.5.4`)
 - **llama-stack 0.3.x**: use provider `==0.4.3` (`pip install llama-stack-provider-ragas==0.4.3`)
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "llama-stack-provider-ragas"
-version = "0.6.1"
+version = "0.7.0"
 description = "Ragas evaluation as an out-of-tree Llama Stack provider"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -25,15 +25,15 @@ authors = [
 keywords = ["llama-stack", "ragas", "evaluation"]
 dependencies = [
     "setuptools-scm",
-    "llama-stack>=0.5.0",
-    "llama-stack-api>=0.5.0",
+    "llama-stack>=0.6.0",
+    "llama-stack-api>=0.6.0",
     "greenlet==3.2.4", # inline/files/localfs errors saying greenlet not found
-    "ragas==0.3.0",
+    "ragas>=0.4.0,<0.5.0",
     "pandas<2.4.0",
     "pyarrow>=21.0.0",
     "requests>=2.32.5",
     "datasets>=2.16.0",
-    "llama-stack-client>=0.5.0",
+    "llama-stack-client>=0.6.0",
 ]
 
 [project.urls]
diff --git a/src/llama_stack_provider_ragas/base.py b/src/llama_stack_provider_ragas/base.py
@@ -39,6 +39,13 @@ class RagasEvaluatorBase(Eval, BenchmarksProtocolPrivate, ABC):
     def __init__(self):
         self.benchmarks: dict[str, Benchmark] = {}
 
+    _DEFAULT_METRICS = [
+        "answer_relevancy",
+        "context_precision",
+        "faithfulness",
+        "context_recall",
+    ]
+
     def _get_metrics(self, scoring_functions: list[str]) -> list:
         """Get the list of metrics to run based on scoring functions.
 
@@ -48,13 +55,6 @@ def _get_metrics(self, scoring_functions: list[str]) -> list:
         Returns:
             List of metrics (unconfigured - ragas_evaluate will configure them)
         """
-        from ragas.metrics import (
-            answer_relevancy,
-            context_precision,
-            context_recall,
-            faithfulness,
-        )
-
         metrics = []
 
         for metric_name in scoring_functions:
@@ -65,14 +65,19 @@ def _get_metrics(self, scoring_functions: list[str]) -> list:
                 logger.warning(f"Unknown metric: {metric_name}")
 
         if not metrics:
-            # Use default metrics if none specified or all invalid
             logger.info("Using default metrics")
-            metrics = [
-                answer_relevancy,
-                context_precision,
-                faithfulness,
-                context_recall,
-            ]
+            for name in self._DEFAULT_METRICS:
+                if name in METRIC_MAPPING:
+                    metrics.append(METRIC_MAPPING[name])
+                else:
+                    logger.warning(
+                        f"Default metric not found in METRIC_MAPPING: {name}"
+                    )
+            if not metrics:
+                raise RagasEvaluationError(
+                    "No valid default metrics found. Check that _DEFAULT_METRICS "
+                    "keys match METRIC_MAPPING entries."
+                )
 
         return metrics
 
diff --git a/src/llama_stack_provider_ragas/constants.py b/src/llama_stack_provider_ragas/constants.py
@@ -1,6 +1,12 @@
 from ragas.metrics import (
+    AnswerAccuracy,
+    ContextRelevance,
+    FactualCorrectness,
+    NoiseSensitivity,
+    ResponseGroundedness,
     answer_relevancy,
     answer_similarity,
+    context_entity_recall,
     context_precision,
     context_recall,
     faithfulness,
@@ -10,18 +16,28 @@
 PROVIDER_ID_INLINE = "trustyai_ragas_inline"
 PROVIDER_ID_REMOTE = "trustyai_ragas_remote"
 
-METRIC_MAPPING = {
-    metric_func.name: metric_func
-    for metric_func in [
-        answer_relevancy,
-        answer_similarity,
-        context_precision,
-        faithfulness,
-        context_recall,
-        # Can add other metrics here, e.g.:
-        # "rouge_score": RougeScore(),
-    ]
-}
+# Pre-instantiated metric singletons (from ragas)
+_SINGLETON_METRICS = [
+    answer_relevancy,
+    answer_similarity,
+    context_precision,
+    faithfulness,
+    context_recall,
+    context_entity_recall,
+]
+
+# Class-based metrics (new in ragas v0.4.x) that need instantiation.
+# Note: BleuScore, ChrfScore, and RougeScore are omitted because they
+# require optional dependencies (sacrebleu, rouge_score).
+_CLASS_METRICS = [
+    AnswerAccuracy(),
+    ContextRelevance(),
+    FactualCorrectness(),
+    NoiseSensitivity(),
+    ResponseGroundedness(),
+]
+
+METRIC_MAPPING = {m.name: m for m in _SINGLETON_METRICS + _CLASS_METRICS}
 AVAILABLE_METRICS = list(METRIC_MAPPING.keys())
 
 # Kubeflow ConfigMap keys and defaults for base image resolution
diff --git a/src/llama_stack_provider_ragas/inline/provider.py b/src/llama_stack_provider_ragas/inline/provider.py
@@ -6,7 +6,7 @@ def get_provider_spec() -> ProviderSpec:
     return InlineProviderSpec(
         api=Api.eval,
         provider_type=f"inline::{PROVIDER_TYPE}",
-        pip_packages=["ragas==0.3.0"],
+        pip_packages=["ragas>=0.4.0,<0.5.0"],
         config_class="llama_stack_provider_ragas.config.RagasProviderInlineConfig",
         module="llama_stack_provider_ragas.inline",
         api_dependencies=[
diff --git a/src/llama_stack_provider_ragas/inline/wrappers_inline.py b/src/llama_stack_provider_ragas/inline/wrappers_inline.py
@@ -158,94 +158,15 @@ async def agenerate_text(
             logger.error(f"LLM generation failed: {str(e)}")
             raise
 
-    # TODO: revisit this
-    # def is_finished(self, response: LLMResult) -> bool:
-    #     """
-    #     Check if the LLM generation completed successfully.
-
-    #     For Llama Stack responses, we check if the generation was completed
-    #     without hitting token limits or other issues.
-    #     """
-    #     try:
-    #         # First, check if we have Llama Stack specific information in llm_output
-    #         if response.llm_output and "llama_stack_responses" in response.llm_output:
-    #             llama_stack_responses = response.llm_output["llama_stack_responses"]
-
-    #             for i, llama_response in enumerate(llama_stack_responses):
-    #                 stop_reason = llama_response.get("stop_reason")
-    #                 content_length = llama_response.get("content_length", 0)
-
-    #                 # Check stop_reason from Llama Stack response
-    #                 if stop_reason == "out_of_tokens":
-    #                     logger.warning(
-    #                         f"Generation {i} hit token limit (stop_reason: {stop_reason})"
-    #                     )
-    #                     return False
-    #                 elif stop_reason == "end_of_message":
-    #                     # This is usually fine for tool calls, but might indicate incomplete generation
-    #                     logger.info(
-    #                         f"Generation {i} ended with end_of_message (stop_reason: {stop_reason})"
-    #                     )
-    #                 elif stop_reason == "end_of_turn":
-    #                     # This is the ideal case - normal completion
-    #                     logger.debug(
-    #                         f"Generation {i} completed normally (stop_reason: {stop_reason})"
-    #                     )
-    #                 elif stop_reason is None:
-    #                     logger.warning(f"Generation {i} has no stop_reason")
-    #                     return False
-
-    #                 # Check content length
-    #                 if content_length == 0:
-    #                     logger.warning(f"Generation {i} has empty content")
-    #                     return False
-    #                 elif content_length < 10:
-    #                     logger.warning(
-    #                         f"Generation {i} has very short content ({content_length} chars)"
-    #                     )
-    #                     return False
-
-    #             # If we have Llama Stack info and all checks pass, we're done
-    #             return True
-
-    #         # Fallback to content-based validation if no Llama Stack info
-    #         for generation_list in response.generations:
-    #             for generation in generation_list:
-    #                 # Check if the generated text is empty or None
-    #                 if not generation.text or generation.text.strip() == "":
-    #                     logger.warning("Empty response from Llama Stack LLM")
-    #                     return False
-
-    #                 # Check if the response indicates an error or incomplete generation
-    #                 if any(
-    #                     error_indicator in generation.text.lower()
-    #                     for error_indicator in [
-    #                         "error",
-    #                         "failed",
-    #                         "timeout",
-    #                         "incomplete",
-    #                         "truncated",
-    #                     ]
-    #                 ):
-    #                     logger.warning(
-    #                         f"Response indicates error or incomplete generation: {generation.text[:100]}..."
-    #                     )
-    #                     return False
-
-    #                 # Check for common truncation indicators
-    #                 if generation.text.endswith("...") or generation.text.endswith("…"):
-    #                     logger.warning("Response appears to be truncated")
-    #                     return False
-
-    #                 # Check if the response is too short (might indicate truncation)
-    #                 if len(generation.text.strip()) < 10:
-    #                     logger.warning("Response is very short, might be incomplete")
-    #                     return False
-
-    #         # If we get here, all generations look good
-    #         return True
-
-    #     except Exception as e:
-    #         logger.error(f"Error checking if LLM generation is finished: {str(e)}")
-    #         # Default to True to avoid false positives, but log the error
-    #         return True
+    def is_finished(self, response: LLMResult) -> bool:
+        """Check if the LLM generation completed successfully."""
+        if response.llm_output and "llama_stack_responses" in response.llm_output:
+            return all(
+                r.get("stop_reason") not in (None, "out_of_tokens")
+                for r in response.llm_output["llama_stack_responses"]
+            )
+        return bool(
+            response.generations
+            and response.generations[0]
+            and any(g.text for g in response.generations[0])
+        )
diff --git a/src/llama_stack_provider_ragas/remote/kubeflow/components.py b/src/llama_stack_provider_ragas/remote/kubeflow/components.py
@@ -95,7 +95,7 @@ def run_ragas_evaluation(
 
     import pandas as pd
     from ragas import EvaluationDataset, evaluate
-    from ragas.dataset_schema import EvaluationResult
+    from ragas.evaluation import EvaluationResult
     from ragas.run_config import RunConfig
 
     from llama_stack_provider_ragas.compat import SamplingParams
diff --git a/src/llama_stack_provider_ragas/remote/provider.py b/src/llama_stack_provider_ragas/remote/provider.py
@@ -9,7 +9,7 @@ def get_provider_spec() -> ProviderSpec:
         adapter_type=PROVIDER_TYPE,
         module="llama_stack_provider_ragas.remote",
         pip_packages=[
-            "ragas==0.3.0",
+            "ragas>=0.4.0,<0.5.0",
             "kfp>=2.5.0",
             "kfp-kubernetes>=2.0.0",
             "s3fs>=2024.12.0",
diff --git a/src/llama_stack_provider_ragas/remote/wrappers_remote.py b/src/llama_stack_provider_ragas/remote/wrappers_remote.py
@@ -240,6 +240,19 @@ async def agenerate_text(
             logger.error(f"Async LLM generation failed: {str(e)}")
             raise
 
+    def is_finished(self, response: LLMResult) -> bool:
+        """Check if the LLM generation completed successfully."""
+        if response.llm_output and "llama_stack_responses" in response.llm_output:
+            return all(
+                r.get("stop_reason") not in (None, "out_of_tokens")
+                for r in response.llm_output["llama_stack_responses"]
+            )
+        return bool(
+            response.generations
+            and response.generations[0]
+            and any(g.text for g in response.generations[0])
+        )
+
     def get_temperature(self, n: int) -> float:
         """Get temperature based on number of completions."""
         return 0.3 if n > 1 else 1e-8
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -103,13 +103,13 @@ def register_benchmarks(
     client.alpha.benchmarks.register(
         benchmark_id=inline_benchmark_id,
         dataset_id=dataset_id,
-        scoring_functions=["semantic_similarity"],
+        scoring_functions=["answer_similarity", "nv_accuracy"],
         provider_id="trustyai_ragas_inline",
     )
     client.alpha.benchmarks.register(
         benchmark_id=remote_benchmark_id,
         dataset_id=dataset_id,
-        scoring_functions=["semantic_similarity"],
+        scoring_functions=["answer_similarity", "nv_accuracy"],
         provider_id="trustyai_ragas_remote",
     )
     yield
diff --git a/tests/test_base.py b/tests/test_base.py
diff --git a/tests/test_inline_evaluation.py b/tests/test_inline_evaluation.py
diff --git a/tests/test_remote_wrappers.py b/tests/test_remote_wrappers.py
diff --git a/uv.lock b/uv.lock