diff --git a/docs/evaluator/index.mdx b/docs/evaluator/index.mdx index b1d8f5f6cc..376fc17a26 100644 --- a/docs/evaluator/index.mdx +++ b/docs/evaluator/index.mdx @@ -59,7 +59,6 @@ Submit your evaluation to the Evaluator service using the NeMo Platform SDK: ```python from nemo_evaluator.sdk import Evaluator -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager from nemo_platform import NeMoPlatform @@ -74,7 +73,6 @@ job = evaluator.submit( metric=metric, dataset=dataset, config=config, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() diff --git a/docs/evaluator/metrics/agent-configuration.mdx b/docs/evaluator/metrics/agent-configuration.mdx index ca855031c9..012bc52d59 100644 --- a/docs/evaluator/metrics/agent-configuration.mdx +++ b/docs/evaluator/metrics/agent-configuration.mdx @@ -164,7 +164,6 @@ from nemo_evaluator_sdk import Agent, RunConfigOnline from nemo_evaluator_sdk import ExactMatchMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ExactMatchMetric(reference="{{item.expected_answer}}") agent = Agent( url="https://my-nat-agent.example.com", @@ -185,7 +184,6 @@ job = evaluator.submit( {"role": "user", "content": "{{item.question}}"}, ], }, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() diff --git a/docs/evaluator/metrics/agentic.mdx b/docs/evaluator/metrics/agentic.mdx index d45e4c0a05..269c1caea6 100644 --- a/docs/evaluator/metrics/agentic.mdx +++ b/docs/evaluator/metrics/agentic.mdx @@ -228,7 +228,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import RunConfig from nemo_evaluator_sdk.metrics.ragas import ToolCallAccuracyMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ToolCallAccuracyMetric() job = evaluator.submit( @@ -251,7 +250,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -430,7 +428,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import RunConfig, ToolCallingMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ToolCallingMetric(reference="{{item.tool_calls}}") @@ -465,7 +462,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -572,7 +568,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import RunConfig, Model from nemo_evaluator_sdk.metrics.ragas import TopicAdherenceMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager judge_model = Model( url="https://integrate.api.nvidia.com/v1/chat/completions", @@ -596,7 +591,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -768,7 +762,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import RunConfig, Model from nemo_evaluator_sdk.metrics.ragas import AgentGoalAccuracyMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager judge_model = Model( url="https://integrate.api.nvidia.com/v1/chat/completions", @@ -798,7 +791,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -925,7 +917,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import RunConfig, Model from nemo_evaluator_sdk.metrics.ragas import AgentGoalAccuracyMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager judge_model = Model( url="https://integrate.api.nvidia.com/v1/chat/completions", @@ -963,7 +954,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -1023,7 +1013,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import RunConfig, Model from nemo_evaluator_sdk.metrics.ragas import AnswerAccuracyMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager judge_model = Model( url="https://integrate.api.nvidia.com/v1/chat/completions", @@ -1042,7 +1031,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -1055,7 +1043,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import RunConfigOnlineModel, InferenceParams, Model from nemo_evaluator_sdk.metrics.ragas import AnswerAccuracyMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager judge_model = Model( url="https://integrate.api.nvidia.com/v1/chat/completions", @@ -1090,7 +1077,6 @@ job = evaluator.submit( } ] }, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() diff --git a/docs/evaluator/metrics/job-management.mdx b/docs/evaluator/metrics/job-management.mdx index 20391c2aef..65fa3fa576 100644 --- a/docs/evaluator/metrics/job-management.mdx +++ b/docs/evaluator/metrics/job-management.mdx @@ -21,7 +21,6 @@ from nemo_evaluator.sdk import Evaluator from nemo_platform import NeMoPlatform from nemo_evaluator_sdk import RunConfig from nemo_evaluator_sdk import ExactMatchMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager sdk = NeMoPlatform( base_url=os.environ.get("NMP_BASE_URL", "http://localhost:8080"), @@ -38,7 +37,6 @@ job = evaluator.submit( {"expected": "Berlin", "output": "Munich"}, ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) print("Submitted job:", job.name) diff --git a/docs/evaluator/metrics/llm-as-a-judge.mdx b/docs/evaluator/metrics/llm-as-a-judge.mdx index 935ec30bd7..abb357affc 100644 --- a/docs/evaluator/metrics/llm-as-a-judge.mdx +++ b/docs/evaluator/metrics/llm-as-a-judge.mdx @@ -298,7 +298,6 @@ For production workloads, submit the same metric and dataset as a durable platfo ```python from nemo_evaluator_sdk import RunConfig, JSONScoreParser, Model, RubricScore, LLMJudgeMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = LLMJudgeMetric( model=Model( @@ -347,7 +346,6 @@ job = evaluator.submit( {"input": "What is 2 + 2?", "output": "4"}, ], config=RunConfig(parallelism=8, limit_samples=100), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) print("Submitted job:", job.name) @@ -436,13 +434,13 @@ By default, the JSON parser is used for range and rubric scores, with the score ```python # JSON parser (default) -"parser": {"type": "json", "json_path": "quality"} +parser = {"type": "json", "json_path": "quality"} # Regex parser (for models that do not support structured output) -"parser": {"type": "regex", "pattern": "QUALITY: (\\w+)"} +parser = {"type": "regex", "pattern": "QUALITY: (\\w+)"} # Regex parser with method='search' (finds pattern anywhere in text) -"parser": {"type": "regex", "pattern": "SCORE: (\\d+)", "method": "search"} +parser = {"type": "regex", "pattern": "SCORE: (\\d+)", "method": "search"} ``` @@ -642,15 +640,13 @@ metric = { Control judge model behavior with inference parameters: ```python - -"prompt_template": { +prompt_template = { "messages": [...], - "temperature": 0.1, # Lower for more consistent scoring - "max_tokens": 1024, # Increase if judge needs more space - "timeout": 30, # Request timeout in seconds - "stop": ["<{{ end_of_text }}>"] # Stop sequences + "temperature": 0.1, # Lower for more consistent scoring + "max_tokens": 1024, # Increase if judge needs more space + "timeout": 30, # Request timeout in seconds + "stop": [""], # Stop sequences } - ``` diff --git a/docs/evaluator/metrics/manage-metrics.mdx b/docs/evaluator/metrics/manage-metrics.mdx index 79d051fa10..0784c869c5 100644 --- a/docs/evaluator/metrics/manage-metrics.mdx +++ b/docs/evaluator/metrics/manage-metrics.mdx @@ -98,7 +98,6 @@ For online evaluations, provide a model or agent target and use the online param ```python from nemo_evaluator_sdk import RunConfig, ExactMatchMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") @@ -109,7 +108,6 @@ job = evaluator.submit( {"expected": "Berlin", "output": "Munich"}, ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() diff --git a/docs/evaluator/metrics/model-configuration.mdx b/docs/evaluator/metrics/model-configuration.mdx index 404378ac3d..537a41e5f0 100644 --- a/docs/evaluator/metrics/model-configuration.mdx +++ b/docs/evaluator/metrics/model-configuration.mdx @@ -208,14 +208,12 @@ Durable remote `evaluator.submit(...)` jobs additionally accept a `ModelRef` tar ```python from nemo_evaluator_sdk import ModelRef, RunConfigOnlineModel -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager job = evaluator.submit( metric=metric, dataset=dataset, config=RunConfigOnlineModel(), target=ModelRef(root="default/my-model"), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) ``` diff --git a/docs/evaluator/metrics/rag.mdx b/docs/evaluator/metrics/rag.mdx index 68bcd58a5f..5502ca3a81 100644 --- a/docs/evaluator/metrics/rag.mdx +++ b/docs/evaluator/metrics/rag.mdx @@ -61,13 +61,7 @@ evaluator: Evaluator = client.evaluator # this object is an Evaluator resource Use `evaluator.run(metric=metric, dataset=dataset)` for a local synchronous evaluation. Use `evaluator.submit(metric=metric, dataset=dataset)` when you need a durable remote job: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - -job = evaluator.submit( - metric=metric, - dataset=dataset, - metric_bundle_packager=CloudpickleMetricBundlePackager(), -) +job = evaluator.submit(metric=metric, dataset=dataset) job.wait_until_done() result = job.get_result() ``` @@ -226,15 +220,12 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = ContextRecallMetric(judge_model=judge_model) job = evaluator.submit( metric=metric, dataset=offline_rows, config=RunConfig(parallelism=8), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -305,15 +296,12 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = ContextPrecisionMetric(judge_model=judge_model) job = evaluator.submit( metric=metric, dataset=offline_rows, config=RunConfig(parallelism=8), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -387,8 +375,6 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = ContextRelevanceMetric(judge_model=judge_model) job = evaluator.submit( @@ -400,7 +386,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=8), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -455,8 +440,6 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = ContextEntityRecallMetric(judge_model=judge_model) job = evaluator.submit( @@ -468,7 +451,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=8), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -533,8 +515,6 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = FaithfulnessMetric(judge_model=judge_model) job = evaluator.submit( @@ -543,7 +523,6 @@ job = evaluator.submit( config=online_config, target=generation_model, prompt_template=online_prompt_template, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -607,8 +586,6 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = ResponseGroundednessMetric(judge_model=judge_model) job = evaluator.submit( @@ -617,7 +594,6 @@ job = evaluator.submit( config=online_config, target=generation_model, prompt_template=online_prompt_template, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -680,8 +656,6 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = NoiseSensitivityMetric(judge_model=judge_model) job = evaluator.submit( @@ -698,7 +672,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=8), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -777,8 +750,6 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = ResponseRelevancyMetric( judge_model=judge_model, embeddings_model=embeddings_model, @@ -791,7 +762,6 @@ job = evaluator.submit( config=online_config, target=generation_model, prompt_template=online_prompt_template, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -915,13 +885,7 @@ judge_model = Model( For durable remote execution, submit the same metric and dataset that you tested locally: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - -job = evaluator.submit( - metric=metric, - dataset=dataset, - metric_bundle_packager=CloudpickleMetricBundlePackager(), -) +job = evaluator.submit(metric=metric, dataset=dataset) job.wait_until_done() artifacts_dir = job.download_artifacts(path="evaluation_artifacts") print(f"Saved artifacts under {artifacts_dir}") diff --git a/docs/evaluator/metrics/remote.mdx b/docs/evaluator/metrics/remote.mdx index b14f66bdc1..8886217f4d 100644 --- a/docs/evaluator/metrics/remote.mdx +++ b/docs/evaluator/metrics/remote.mdx @@ -155,7 +155,6 @@ For production workloads, submit the same metric and dataset as a durable platfo ```python from nemo_evaluator_sdk import RunConfig, JSONScoreParser, RemoteScore, RemoteMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = RemoteMetric( url="https://my-evaluation-server.test/evaluate", @@ -183,7 +182,6 @@ job = evaluator.submit( {"reference": "2", "output": "2"}, ], config=RunConfig(parallelism=8), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) print("Submitted job:", job.name) @@ -199,7 +197,6 @@ for score in result.aggregate_scores.scores: ```python from nemo_evaluator_sdk import RunConfig, NemoAgentToolkitRemoteMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = NemoAgentToolkitRemoteMetric( url="http://localhost:8001/evaluate_item", @@ -223,7 +220,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() diff --git a/docs/evaluator/metrics/results.mdx b/docs/evaluator/metrics/results.mdx index 41bc7d82e6..07a1b333ea 100644 --- a/docs/evaluator/metrics/results.mdx +++ b/docs/evaluator/metrics/results.mdx @@ -47,7 +47,6 @@ result = evaluator.run( ```python from nemo_evaluator_sdk import RunConfig, ExactMatchMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") @@ -58,7 +57,6 @@ job = evaluator.submit( {"expected": "Berlin", "output": "Munich"}, ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() ``` diff --git a/docs/evaluator/metrics/similarity.mdx b/docs/evaluator/metrics/similarity.mdx index 3be35bf736..75138076e1 100644 --- a/docs/evaluator/metrics/similarity.mdx +++ b/docs/evaluator/metrics/similarity.mdx @@ -31,13 +31,7 @@ evaluator: Evaluator = sdk.evaluator # this object is an Evaluator resource Use `evaluator.run(metric=metric, dataset=dataset)` for a local synchronous evaluation. Use `evaluator.submit(metric=metric, dataset=dataset)` when you need a durable remote job: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - -job = evaluator.submit( - metric=metric, - dataset=dataset, - metric_bundle_packager=CloudpickleMetricBundlePackager(), -) +job = evaluator.submit(metric=metric, dataset=dataset) job.wait_until_done() result = job.get_result() ``` @@ -111,7 +105,6 @@ for score in result.aggregate_scores.scores: ```python from nemo_evaluator_sdk import BLEUMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = BLEUMetric( references=["{{item.reference_1}}", "{{item.reference_2}}"], @@ -133,7 +126,6 @@ job = evaluator.submit( "model_output": "Hello world!", }, ], - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -206,7 +198,6 @@ for score in result.aggregate_scores.scores: ```python from nemo_evaluator_sdk import ExactMatchMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ExactMatchMetric( reference="{{item.correct_answer | lower | trim}}", @@ -220,7 +211,6 @@ job = evaluator.submit( {"correct_answer": "London", "model_answer": "london "}, {"correct_answer": "Berlin", "model_answer": "Munich"}, ], - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -289,7 +279,6 @@ for score in result.aggregate_scores.scores: ```python from nemo_evaluator_sdk import F1Metric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = F1Metric( reference="{{item.reference}}", @@ -305,7 +294,6 @@ job = evaluator.submit( }, {"reference": "a red apple", "answer": "red apple"}, ], - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -382,7 +370,6 @@ for score in result.aggregate_scores.scores: ```python from nemo_evaluator_sdk import NumberCheckMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = NumberCheckMetric( operation=">", @@ -399,7 +386,6 @@ job = evaluator.submit( {"predicted": "0.5"}, {"predicted": "0.1"}, ], - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -471,7 +457,6 @@ for score in result.aggregate_scores.scores: ```python from nemo_evaluator_sdk import ROUGEMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ROUGEMetric( reference="{{item.reference_summary}}", @@ -490,7 +475,6 @@ job = evaluator.submit( "model_summary": "High winds delayed the launch.", }, ], - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -579,7 +563,6 @@ for score in result.aggregate_scores.scores: ```python from nemo_evaluator_sdk import StringCheckMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = StringCheckMetric( operation="startswith", @@ -595,7 +578,6 @@ job = evaluator.submit( {"output": "Answer: Success"}, {"output": "Error occurred"}, ], - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() diff --git a/docs/evaluator/sdk-resources.mdx b/docs/evaluator/sdk-resources.mdx index b1ed7d36e6..ca0ee6d708 100644 --- a/docs/evaluator/sdk-resources.mdx +++ b/docs/evaluator/sdk-resources.mdx @@ -88,7 +88,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import ExactMatchMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") @@ -97,11 +96,7 @@ dataset = [ {"expected": "Berlin", "output": "Munich"}, ] -job = evaluator.submit( - metric=metric, - dataset=dataset, - metric_bundle_packager=CloudpickleMetricBundlePackager(), -) +job = evaluator.submit(metric=metric, dataset=dataset) job.wait_until_done() result = job.get_result() print(result.aggregate_scores) @@ -143,7 +138,6 @@ evaluator: AsyncEvaluator = client.evaluator import asyncio from nemo_evaluator_sdk import ExactMatchMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") @@ -154,11 +148,7 @@ dataset = [ async def main() -> None: - job = await evaluator.submit( - metric=metric, - dataset=dataset, - metric_bundle_packager=CloudpickleMetricBundlePackager(), - ) + job = await evaluator.submit(metric=metric, dataset=dataset) await job.wait_until_done() result = await job.get_result() print(result.aggregate_scores) diff --git a/docs/evaluator/test_doc_examples.py b/docs/evaluator/test_doc_examples.py index 2d9f337a1f..6126ee3601 100644 --- a/docs/evaluator/test_doc_examples.py +++ b/docs/evaluator/test_doc_examples.py @@ -10,9 +10,10 @@ import paths and call contract that every runnable doc snippet relies on, so the docs cannot silently drift from the SDK again. -These checks run fully offline: they exercise import locations and the -client-side argument validation in ``Evaluator.submit`` / ``AsyncEvaluator.submit``. -They do not submit jobs and need no running platform or model credentials. +These checks run fully offline: they exercise import locations and the packager +contract for ``Evaluator.submit`` — built-in metrics bundle inline and need no +packager, while custom metrics require an explicit one. They do not submit jobs +and need no running platform or model credentials. Run directly: uv run python docs/evaluator/test_doc_examples.py @@ -27,9 +28,26 @@ import pytest from nemo_evaluator.sdk import Evaluator +from nemo_evaluator.shared.metric_bundles.bundles import MetricBundlePackagerPolicyError +from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult from nemo_platform import NeMoPlatform +class _CustomMetric: + """A metric that is not a built-in type (cannot be reconstructed from config).""" + + type = "custom-score" + description = "custom metric" + labels: dict[str, str] = {} + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("score")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + del input + return MetricResult(outputs=[MetricOutput(name="score", value=1.0)]) + + def test_filesetref_imports_from_platform_sdk() -> None: """Docs import ``FilesetRef`` from ``nemo_evaluator.sdk`` (platform helpers).""" from nemo_evaluator.sdk import FilesetRef @@ -84,16 +102,35 @@ def test_packager_param_is_submit_only() -> None: assert "metric_bundle_packager" not in run_params -def test_submit_requires_metric_bundle_packager() -> None: - """``submit()`` without a packager raises the documented ValueError, offline.""" +def test_builtin_submit_does_not_require_a_packager() -> None: + """Built-in metrics bundle inline, so docs omit the packager on ``submit()``. + + Packager resolution happens before delegating to the executor, so we stub the + executor with a sentinel: reaching it (rather than raising a packager-policy + error) proves the built-in metric bundled inline with no packager required — + without depending on a live service or swallowing unrelated failures. + """ + from unittest.mock import patch + from nemo_evaluator_sdk import ExactMatchMetric evaluator = _evaluator() metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") dataset = [{"expected": "Paris", "output": "Paris"}] + sentinel = RuntimeError("reached executor.submit (packaging resolved without a packager)") + + with patch.object(evaluator._executor, "submit", side_effect=sentinel): + with pytest.raises(RuntimeError, match="reached executor.submit"): + evaluator.submit(metric=metric, dataset=dataset) + + +def test_custom_submit_requires_an_explicit_packager() -> None: + """Custom (non-built-in) metrics still require an explicit packager for durable submit.""" + evaluator = _evaluator() + dataset = [{"expected": "Paris", "output": "Paris"}] - with pytest.raises(ValueError, match="metric_bundle_packager is required"): - evaluator.submit(metric=metric, dataset=dataset) + with pytest.raises(MetricBundlePackagerPolicyError, match="CloudpickleMetricBundlePackager"): + evaluator.submit(metric=_CustomMetric(), dataset=dataset) def test_run_does_not_require_metric_bundle_packager() -> None: diff --git a/docs/evaluator/tutorials/run-llm-judge-evaluation.mdx b/docs/evaluator/tutorials/run-llm-judge-evaluation.mdx index f66164f258..22e1aed14e 100644 --- a/docs/evaluator/tutorials/run-llm-judge-evaluation.mdx +++ b/docs/evaluator/tutorials/run-llm-judge-evaluation.mdx @@ -364,8 +364,6 @@ The first response is comprehensive and helpful, while the second is unhelpfully Now let's evaluate a larger sample and compare the judge's predictions against human annotations. This tells us how well our judge aligns with human judgment. ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - sample_config = RunConfig( parallelism=1, limit_samples=5, @@ -396,7 +394,6 @@ job_v1 = evaluator.submit( metric=metric_v1_remote, dataset=dataset_ref, config=sample_config, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) print(f"Job submitted: {job_v1.name}") ``` @@ -510,7 +507,6 @@ job_v2 = evaluator.submit( metric=metric_v2_remote, dataset=dataset_ref, config=sample_config, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) print(f"Job submitted: {job_v2.name}") diff --git a/plugins/nemo-evaluator/openapi/openapi.yaml b/plugins/nemo-evaluator/openapi/openapi.yaml index 4b1f07209e..aa28f86ee9 100644 --- a/plugins/nemo-evaluator/openapi/openapi.yaml +++ b/plugins/nemo-evaluator/openapi/openapi.yaml @@ -1118,6 +1118,40 @@ components: description: Parameters for model inference. Extra fields can be supplied for additional options applied to the inference request directly. Fields not supported by the model may cause inference errors during evaluation. + InlineMetricPayload: + properties: + kind: + type: string + const: inline + title: Kind + description: Payload format discriminator. + metric: + additionalProperties: true + type: object + title: Metric + description: JSON-serialized built-in metric configuration, discriminated + by its own `type`. + digest: + title: Digest + description: SHA-256 digest of the canonical metric JSON. Informational; + recomputed server-side. + type: string + additionalProperties: false + type: object + required: + - kind + - metric + title: InlineMetricPayload + description: 'Wire schema for an inline (config-serialized) metric payload. + + + Mirrors the runtime ``InlineMetricPayload``. The metric is stored as its own + + JSON configuration and reconstructed from the metric type union at execution, + + so no code is shipped or executed on load. Used for platform-recognized + + built-in metric types.' Metric: properties: id: @@ -1272,12 +1306,14 @@ components: payload: oneOf: - $ref: '#/components/schemas/CloudpickleMetricPayload' + - $ref: '#/components/schemas/InlineMetricPayload' title: Payload description: Format-specific serialized metric. discriminator: propertyName: kind mapping: cloudpickle: '#/components/schemas/CloudpickleMetricPayload' + inline: '#/components/schemas/InlineMetricPayload' additionalProperties: false type: object required: diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/api/schemas.py b/plugins/nemo-evaluator/src/nemo_evaluator/api/schemas.py index 927a19224b..b90ee9097e 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/api/schemas.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/api/schemas.py @@ -7,7 +7,7 @@ from datetime import datetime from enum import StrEnum -from typing import Annotated, Literal +from typing import Annotated, Any, Literal from nemo_evaluator.shared.metric_bundles.bundles import ( BundledMetricOutputSpec, @@ -15,7 +15,7 @@ ) from nemo_evaluator_sdk.values.common import SecretRef from nemo_platform_plugin.schema import DatetimeFilter, Filter -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, field_validator class CloudpickleMetricPayload(BaseModel): @@ -40,9 +40,44 @@ class CloudpickleMetricPayload(BaseModel): ) +class InlineMetricPayload(BaseModel): + """Wire schema for an inline (config-serialized) metric payload. + + Mirrors the runtime ``InlineMetricPayload``. The metric is stored as its own + JSON configuration and reconstructed from the metric type union at execution, + so no code is shipped or executed on load. Used for platform-recognized + built-in metric types. + """ + + model_config = ConfigDict(extra="forbid") + + kind: Literal["inline"] = Field(description="Payload format discriminator.") + metric: dict[str, Any] = Field( + description="JSON-serialized built-in metric configuration, discriminated by its own `type`." + ) + digest: str | None = Field( + default=None, + description="SHA-256 digest of the canonical metric JSON. Informational; recomputed server-side.", + ) + + @field_validator("metric") + @classmethod + def _metric_must_declare_type(cls, value: dict[str, Any]) -> dict[str, Any]: + """Reject payloads without a metric ``type`` discriminator at the API boundary. + + The metric body stays an open object (the concrete shape is validated when + the bundle is hydrated against the metric type union), but a non-empty + ``type`` is required so malformed payloads fail fast rather than at execution. + """ + metric_type = value.get("type") + if not isinstance(metric_type, str) or not metric_type: + raise ValueError("inline metric payload must include a non-empty 'type'") + return value + + # Discriminated on ``kind`` so additional payload formats can join the union -# without changing the field type. Cloudpickle is the only kind today. -MetricPayload = Annotated[CloudpickleMetricPayload, Field(discriminator="kind")] +# without changing the field type. +MetricPayload = Annotated[CloudpickleMetricPayload | InlineMetricPayload, Field(discriminator="kind")] class MetricInline(BaseModel): diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py b/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py index 86a4842d1e..ed4c92441f 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py @@ -11,6 +11,10 @@ from pathlib import Path from typing import Annotated, Any, ClassVar, Self, TypeAlias +# Imported for their registration side effects: each module registers its +# payload kind in the bundle registry so MetricBundle payloads validate. +import nemo_evaluator.shared.metric_bundles.cloudpickle # noqa: F401 +import nemo_evaluator.shared.metric_bundles.inline # noqa: F401 from nemo_evaluator.api.schemas import MetricInline from nemo_evaluator.filesets import FilesetRef, download_dataset, download_dataset_sync from nemo_evaluator.metric_refs import MetricRef, resolve_metric_specs @@ -21,7 +25,6 @@ metric_bundle_packager_for_payload, unbundle_metric, ) -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricPayload # noqa: F401 from nemo_evaluator_sdk import Evaluator from nemo_evaluator_sdk.execution.config import resolve_params from nemo_evaluator_sdk.execution.metric_execution import run_sync diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/metric_storage.py b/plugins/nemo-evaluator/src/nemo_evaluator/metric_storage.py index b15baddf7d..5cbebd5432 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/metric_storage.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/metric_storage.py @@ -16,9 +16,10 @@ import logging import uuid -# Importing the cloudpickle module registers the "cloudpickle" payload kind in -# the bundle registry so MetricBundle payloads round-trip through validation. +# Importing the payload modules registers their bundle payload kinds in the +# bundle registry so MetricBundle payloads round-trip through validation. import nemo_evaluator.shared.metric_bundles.cloudpickle # noqa: F401 +import nemo_evaluator.shared.metric_bundles.inline # noqa: F401 from nemo_evaluator.shared.metric_bundles.bundles import MetricBundle from nemo_platform import AsyncNeMoPlatform from pydantic import ValidationError diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/sdk/_executor.py b/plugins/nemo-evaluator/src/nemo_evaluator/sdk/_executor.py index c5da61daa4..e6b48a10d5 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/sdk/_executor.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/sdk/_executor.py @@ -23,8 +23,13 @@ ) from nemo_evaluator.sdk.types import PluginDatasetInput from nemo_evaluator.sdk.utils import filter_benchmark_result, filter_evaluation_result -from nemo_evaluator.shared.metric_bundles.bundles import MetricBundle, MetricBundlePackager, bundle_metric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager +from nemo_evaluator.shared.metric_bundles.bundles import ( + MetricBundle, + MetricBundlePackager, + MetricBundlePackagerPolicyError, + bundle_metric, +) +from nemo_evaluator.shared.metric_bundles.defaults import resolve_default_metric_bundle_packager from nemo_evaluator_sdk.datasets.loader import prepare_dataset_rows from nemo_evaluator_sdk.execution.config import resolve_params from nemo_evaluator_sdk.execution.metric_execution import run_sync @@ -52,10 +57,6 @@ SubmitTargetSpec = TargetSpec | ModelRef -class MetricBundlePackagerPolicyError(RuntimeError): - """Raised when plugin backend metric packaging is not configured.""" - - def _require_metric_bundle_packager(metric_bundle_packager: MetricBundlePackager | None) -> MetricBundlePackager: if metric_bundle_packager is None: raise MetricBundlePackagerPolicyError( @@ -307,7 +308,9 @@ def evaluate( target=target, field_mapping=field_mapping, prompt_template=prompt_template, - metric_bundle_packager=CloudpickleMetricBundlePackager(), + metric_bundle_packager=resolve_default_metric_bundle_packager( + metric, None, allow_cloudpickle_fallback=True, action="Running" + ), ) payload = self.run_local( spec=spec, @@ -367,7 +370,9 @@ def evaluate_benchmark( target=target, field_mapping=field_mapping, prompt_template=prompt_template, - metric_bundle_packager=CloudpickleMetricBundlePackager(), + metric_bundle_packager=resolve_default_metric_bundle_packager( + metrics, None, allow_cloudpickle_fallback=True, action="Running" + ), ) payload = self.run_local( spec=spec, @@ -538,7 +543,9 @@ async def evaluate( target=target, field_mapping=field_mapping, prompt_template=prompt_template, - metric_bundle_packager=CloudpickleMetricBundlePackager(), + metric_bundle_packager=resolve_default_metric_bundle_packager( + metric, None, allow_cloudpickle_fallback=True, action="Running" + ), ) payload = await self.run_local( spec=spec, @@ -569,7 +576,9 @@ async def evaluate_benchmark( target=target, field_mapping=field_mapping, prompt_template=prompt_template, - metric_bundle_packager=CloudpickleMetricBundlePackager(), + metric_bundle_packager=resolve_default_metric_bundle_packager( + metrics, None, allow_cloudpickle_fallback=True, action="Running" + ), ) payload = await self.run_local( spec=spec, diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/sdk/metric_resources.py b/plugins/nemo-evaluator/src/nemo_evaluator/sdk/metric_resources.py index 4def0cf79e..7854e468c4 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/sdk/metric_resources.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/sdk/metric_resources.py @@ -20,6 +20,7 @@ MetricBundlePackager, bundle_metric, ) +from nemo_evaluator.shared.metric_bundles.defaults import resolve_default_metric_bundle_packager from nemo_evaluator_sdk.metrics.protocol import Metric as RuntimeMetric from nemo_platform import AsyncNeMoPlatform, NeMoPlatform from nemo_platform_plugin.schema import Page @@ -32,13 +33,11 @@ def _metric_inline( """Package a runtime metric (or accept a pre-built bundle) as the wire DTO.""" if isinstance(metric, MetricBundle): bundle = metric - elif metric_bundle_packager is None: - raise ValueError( - "metric_bundle_packager is required when storing a runtime metric; " - "pass CloudpickleMetricBundlePackager(), or pass a pre-built MetricBundle." - ) else: - bundle = bundle_metric(metric, metric_bundle_packager) + packager = resolve_default_metric_bundle_packager( + metric, metric_bundle_packager, allow_cloudpickle_fallback=False, action="Storing" + ) + bundle = bundle_metric(metric, packager) # JSON round-trip keeps the base64 payload encoding consistent with the runtime model. return MetricInline.model_validate_json(bundle.model_dump_json()) diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/sdk/resources.py b/plugins/nemo-evaluator/src/nemo_evaluator/sdk/resources.py index 2f210c4b90..fd4528071d 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/sdk/resources.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/sdk/resources.py @@ -30,6 +30,7 @@ RunConfigOnlineModel, ) from nemo_evaluator.shared.metric_bundles.bundles import MetricBundlePackager +from nemo_evaluator.shared.metric_bundles.defaults import resolve_default_metric_bundle_packager from nemo_evaluator_sdk.metrics.protocol import Metric from nemo_evaluator_sdk.values import ( Agent, @@ -135,11 +136,6 @@ def submit( metric_bundle_packager: MetricBundlePackager | None = None, ) -> EvaluatorJobResource: """Submit a metric job through the evaluator plugin executor.""" - if metric_bundle_packager is None: - raise ValueError( - "metric_bundle_packager is required for submit(); " - "pass CloudpickleMetricBundlePackager() to enable metric bundling." - ) return self._executor.submit( metric=metric, dataset=dataset, @@ -147,7 +143,9 @@ def submit( target=target, field_mapping=field_mapping, prompt_template=prompt_template, - metric_bundle_packager=metric_bundle_packager, + metric_bundle_packager=resolve_default_metric_bundle_packager( + metric, metric_bundle_packager, allow_cloudpickle_fallback=False, action="Submitting" + ), ) @overload @@ -365,11 +363,6 @@ async def submit( metric_bundle_packager: MetricBundlePackager | None = None, ) -> AsyncEvaluatorJobResource: """Submit a metric job through the evaluator plugin executor.""" - if metric_bundle_packager is None: - raise ValueError( - "metric_bundle_packager is required for submit(); " - "pass CloudpickleMetricBundlePackager() to enable metric bundling." - ) return await self._executor.submit( metric=metric, dataset=dataset, @@ -377,7 +370,9 @@ async def submit( target=target, field_mapping=field_mapping, prompt_template=prompt_template, - metric_bundle_packager=metric_bundle_packager, + metric_bundle_packager=resolve_default_metric_bundle_packager( + metric, metric_bundle_packager, allow_cloudpickle_fallback=False, action="Submitting" + ), ) diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/bundles.py b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/bundles.py index bef57ebe50..6d99dd6d73 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/bundles.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/bundles.py @@ -34,6 +34,10 @@ class MetricBundlingError(ValueError): """Raised when a metric cannot be bundled or hydrated.""" +class MetricBundlePackagerPolicyError(RuntimeError): + """Raised when metric packaging is not configured for an operation.""" + + class MetricMetadata(BaseModel): """User-facing metadata captured with a bundled metric.""" diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/defaults.py b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/defaults.py new file mode 100644 index 0000000000..49f7eb2a02 --- /dev/null +++ b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/defaults.py @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Default metric bundle packager selection. + +Encapsulates the policy for choosing a packager when the caller does not provide +one explicitly. Built-in metric types use the inline packager (config-serialized, +no code execution). Custom metrics fall back to cloudpickle for local execution, +or require an explicit cloudpickle opt-in for operations that ship the metric to +the service. +""" + +from __future__ import annotations + +from collections.abc import Sequence + +from nemo_evaluator.shared.metric_bundles.bundles import ( + MetricBundlePackager, + MetricBundlePackagerPolicyError, +) +from nemo_evaluator.shared.metric_bundles.hybrid import HybridMetricBundlePackager +from nemo_evaluator.shared.metric_bundles.inline import InlineMetricBundlePackager, inline_bundle_supported +from nemo_evaluator_sdk.metrics.protocol import Metric + + +def resolve_default_metric_bundle_packager( + metric: Metric | Sequence[Metric], + explicit: MetricBundlePackager | None, + *, + allow_cloudpickle_fallback: bool, + action: str, +) -> MetricBundlePackager: + """Resolve the packager to use for one or more metrics. + + An explicit packager is always honored. Otherwise the inline packager is used + when every metric is a built-in type. When a custom metric is present, local + execution (``allow_cloudpickle_fallback=True``) uses the hybrid packager so + built-in metrics still bundle inline and only the custom metric is + cloudpickled; operations that ship the metric to the service require an + explicit opt-in instead. + + Args: + metric: A runtime metric, or a sequence of them (one packager applies to all). + explicit: A caller-provided packager, if any. + allow_cloudpickle_fallback: Whether custom metrics may default to cloudpickle. + action: Verb describing the operation, used in the error message. + + Returns: + The packager to bundle the metric(s) with. + + Raises: + MetricBundlePackagerPolicyError: When a custom metric needs an explicit + packager and no fallback is allowed. + """ + if explicit is not None: + return explicit + metrics = metric if isinstance(metric, Sequence) and not isinstance(metric, (str, bytes)) else [metric] + if all(inline_bundle_supported(item) for item in metrics): + return InlineMetricBundlePackager() + if allow_cloudpickle_fallback: + return HybridMetricBundlePackager() + raise MetricBundlePackagerPolicyError( + f"{action} a custom metric requires an explicit metric_bundle_packager; " + "pass HybridMetricBundlePackager() (recommended — built-in metrics stay inline) " + "or CloudpickleMetricBundlePackager() to bundle the metric code." + ) diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/hybrid.py b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/hybrid.py new file mode 100644 index 0000000000..0276c624c3 --- /dev/null +++ b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/hybrid.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Hybrid metric bundle packager. + +Packages each metric with the lightest representation that supports it: built-in +metric types are bundled inline (config-serialized, reconstructed from the metric +type union), and only metrics that cannot be reconstructed from configuration are +cloudpickled. This minimizes cloudpickle usage so built-in metrics avoid the +Python-version coupling of cloudpickle payloads — relevant when a remote service +runs a different interpreter than the submitter. +""" + +from __future__ import annotations + +from nemo_evaluator.shared.metric_bundles.bundles import ( + MetricBundlePackager, + MetricBundlePayload, + metric_bundle_packager_for_payload, +) +from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager +from nemo_evaluator.shared.metric_bundles.inline import InlineMetricBundlePackager, inline_bundle_supported +from nemo_evaluator_sdk.metrics.protocol import Metric + + +class HybridMetricBundlePackager(MetricBundlePackager): + """Inline built-in metrics; cloudpickle only metrics that require it. + + Applied per metric, so a mixed set bundles each metric independently: inline + where the metric type is reconstructable, cloudpickle otherwise. Loading is + dispatched by the stored payload kind, so hydration works regardless of which + representation each metric used. + """ + + def __init__(self) -> None: + """Build the delegate inline and cloudpickle packagers.""" + self._inline = InlineMetricBundlePackager() + self._cloudpickle = CloudpickleMetricBundlePackager() + + def package(self, metric: Metric) -> MetricBundlePayload: + """Inline the metric when its type is reconstructable; cloudpickle otherwise.""" + if inline_bundle_supported(metric): + return self._inline.package(metric) + return self._cloudpickle.package(metric) + + def load(self, payload: MetricBundlePayload) -> Metric: + """Hydrate by dispatching to the packager registered for the payload kind.""" + return metric_bundle_packager_for_payload(payload).load(payload) diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/inline.py b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/inline.py new file mode 100644 index 0000000000..af395321c6 --- /dev/null +++ b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/inline.py @@ -0,0 +1,107 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Inline metric bundle implementation. + +The inline packager stores a built-in metric as its own JSON configuration +instead of a serialized code blob. The runtime reconstructs the metric from the +``MetricsUnion`` discriminated union (keyed on the metric ``type``), so no +arbitrary code is executed on load. This is the default bundler for metric types +the platform already recognizes; custom metric classes that are not part of +``MetricsUnion`` cannot be reconstructed from config and require the +``CloudpickleMetricBundlePackager`` instead. +""" + +from __future__ import annotations + +import hashlib +import json +from typing import Any, Literal, get_args + +from nemo_evaluator.shared.metric_bundles.bundles import ( + MetricBundlePackager, + MetricBundlePayload, + MetricBundlingError, + register_metric_bundle_kind, +) +from nemo_evaluator_sdk.metrics.protocol import Metric +from nemo_evaluator_sdk.metrics.types import MetricsUnion, MetricVariants +from pydantic import ConfigDict, TypeAdapter, computed_field, field_validator + +# Discriminated union (keyed on ``type``) used to serialize and reconstruct the +# concrete built-in metric. Reconstruction is pure data validation — no code is +# executed — so inline bundles are safe to hydrate. +_METRIC_ADAPTER: TypeAdapter[Any] = TypeAdapter(MetricsUnion) + +# Concrete metric classes that participate in ``MetricsUnion``. A metric must be +# an instance of one of these to be inline-bundleable. +_INLINE_SUPPORTED_TYPES: tuple[type, ...] = tuple(get_args(MetricVariants)) + + +def inline_bundle_supported(metric: object) -> bool: + """Return whether a metric can be bundled inline (reconstructed from config).""" + return isinstance(metric, _INLINE_SUPPORTED_TYPES) + + +class InlineMetricPayload(MetricBundlePayload): + """Payload storing a built-in metric as its JSON configuration.""" + + model_config = ConfigDict(extra="ignore") + + metric: dict[str, Any] + + @field_validator("metric") + @classmethod + def _metric_must_declare_type(cls, value: dict[str, Any]) -> dict[str, Any]: + metric_type = value.get("type") + if not isinstance(metric_type, str) or not metric_type: + raise MetricBundlingError("inline metric payload must include a non-empty 'type'") + return value + + @property + def kind(self) -> Literal["inline"]: + """Payload discriminator used by the metric bundle registry.""" + return "inline" + + @computed_field + @property + def digest(self) -> str: + """Digest of the canonical serialized metric configuration.""" + canonical = json.dumps(self.metric, sort_keys=True, separators=(",", ":")) + return hashlib.sha256(canonical.encode("utf-8")).hexdigest() + + +class InlineMetricBundlePackager(MetricBundlePackager): + """Inline metric bundle packager. + + Serializes a built-in metric as JSON and reconstructs it from the metric + type union on load. No arbitrary code is executed when hydrating, so this is + the preferred default for platform-recognized metric types. + """ + + def package(self, metric: Metric) -> MetricBundlePayload: + """Package a built-in metric object as its JSON configuration.""" + if not isinstance(metric, Metric): + raise MetricBundlingError("object does not satisfy the Metric protocol") + if not inline_bundle_supported(metric): + raise MetricBundlingError( + "inline metric bundling supports only built-in metric types; " + "pass CloudpickleMetricBundlePackager() to bundle a custom metric." + ) + data: dict[str, Any] = _METRIC_ADAPTER.dump_python(metric, mode="json") + return InlineMetricPayload(metric=data) + + def load(self, payload: MetricBundlePayload) -> Metric: + """Hydrate a metric from an inline payload by validating its configuration.""" + inline_payload = InlineMetricPayload.model_validate(payload.model_dump(mode="python")) + hydrated_metric = _METRIC_ADAPTER.validate_python(inline_payload.metric) + if not isinstance(hydrated_metric, Metric): + raise MetricBundlingError("unbundled object does not satisfy the Metric protocol") + return hydrated_metric + + +register_metric_bundle_kind( + "inline", + payload_type=InlineMetricPayload, + packager_factory=InlineMetricBundlePackager, +) diff --git a/plugins/nemo-evaluator/tests/sdk/test_metric_sdk_resources.py b/plugins/nemo-evaluator/tests/sdk/test_metric_sdk_resources.py index 9e3c764d55..75115e0934 100644 --- a/plugins/nemo-evaluator/tests/sdk/test_metric_sdk_resources.py +++ b/plugins/nemo-evaluator/tests/sdk/test_metric_sdk_resources.py @@ -6,7 +6,7 @@ from __future__ import annotations from datetime import datetime, timezone -from typing import Any +from typing import Any, cast from unittest.mock import AsyncMock, MagicMock import pytest @@ -15,9 +15,30 @@ AsyncEvaluatorMetricsResource, EvaluatorMetricsResource, ) -from nemo_evaluator.shared.metric_bundles.bundles import MetricBundle, bundle_metric +from nemo_evaluator.shared.metric_bundles.bundles import ( + MetricBundle, + MetricBundlePackagerPolicyError, + bundle_metric, +) from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager from nemo_evaluator_sdk.metrics.exact_match import ExactMatchMetric +from nemo_evaluator_sdk.metrics.protocol import Metric as RuntimeMetric +from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult + + +class _CustomRuntimeMetric: + """A protocol-satisfying metric that is not inline-bundleable.""" + + type = "custom-score" + description = "custom metric" + labels: dict[str, str] = {} + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("score")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + del input + return MetricResult(outputs=[MetricOutput(name="score", value=1.0)]) def _bundle() -> MetricBundle: @@ -81,12 +102,24 @@ def test_sync_create_posts_bundle_and_returns_metric() -> None: assert body["payload"]["kind"] == "cloudpickle" -def test_sync_create_requires_packager_for_runtime_metric() -> None: - resource = EvaluatorMetricsResource(_platform(MagicMock())) +def test_sync_create_defaults_to_inline_for_builtin_metric() -> None: + bundle = _bundle() + http_client = MagicMock() + http_client.post.return_value = _response(_metric_response("exact", bundle)) + resource = EvaluatorMetricsResource(_platform(http_client)) metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") - with pytest.raises(ValueError, match="metric_bundle_packager is required"): - resource.create("exact", metric=metric) + resource.create("exact", metric=metric) + + body = http_client.post.call_args.kwargs["json"] + assert body["payload"]["kind"] == "inline" + + +def test_sync_create_requires_explicit_packager_for_custom_metric() -> None: + resource = EvaluatorMetricsResource(_platform(MagicMock())) + + with pytest.raises(MetricBundlePackagerPolicyError, match="CloudpickleMetricBundlePackager"): + resource.create("custom", metric=cast(RuntimeMetric, _CustomRuntimeMetric())) def test_sync_retrieve_targets_item_url() -> None: diff --git a/plugins/nemo-evaluator/tests/shared/metric_bundles/test_defaults.py b/plugins/nemo-evaluator/tests/shared/metric_bundles/test_defaults.py new file mode 100644 index 0000000000..378935aba7 --- /dev/null +++ b/plugins/nemo-evaluator/tests/shared/metric_bundles/test_defaults.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import cast + +import pytest +from nemo_evaluator.shared.metric_bundles.bundles import MetricBundlePackagerPolicyError +from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager +from nemo_evaluator.shared.metric_bundles.defaults import resolve_default_metric_bundle_packager +from nemo_evaluator.shared.metric_bundles.hybrid import HybridMetricBundlePackager +from nemo_evaluator.shared.metric_bundles.inline import InlineMetricBundlePackager +from nemo_evaluator_sdk.metrics.exact_match import ExactMatchMetric +from nemo_evaluator_sdk.metrics.protocol import Metric, MetricInput, MetricOutput, MetricOutputSpec, MetricResult + + +class _CustomMetric: + """A protocol-satisfying metric that is not part of MetricsUnion (not inline-bundleable).""" + + type = "custom-score" + description = "custom metric" + labels: dict[str, str] = {} + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("score")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + del input + return MetricResult(outputs=[MetricOutput(name="score", value=1.0)]) + + +def _builtin() -> Metric: + return ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") + + +def test_defaults_inline_for_builtin_and_raises_for_custom_submit() -> None: + assert isinstance( + resolve_default_metric_bundle_packager(_builtin(), None, allow_cloudpickle_fallback=False, action="Submitting"), + InlineMetricBundlePackager, + ) + with pytest.raises(MetricBundlePackagerPolicyError, match="CloudpickleMetricBundlePackager"): + resolve_default_metric_bundle_packager( + cast(Metric, _CustomMetric()), None, allow_cloudpickle_fallback=False, action="Submitting" + ) + + +def test_uses_hybrid_for_custom_local_run() -> None: + assert isinstance( + resolve_default_metric_bundle_packager( + cast(Metric, _CustomMetric()), None, allow_cloudpickle_fallback=True, action="Running" + ), + HybridMetricBundlePackager, + ) + + +def test_uses_hybrid_for_mixed_local_run_but_inline_when_all_builtin() -> None: + mixed = [_builtin(), cast(Metric, _CustomMetric())] + assert isinstance( + resolve_default_metric_bundle_packager(mixed, None, allow_cloudpickle_fallback=True, action="Running"), + HybridMetricBundlePackager, + ) + assert isinstance( + resolve_default_metric_bundle_packager( + [_builtin()], None, allow_cloudpickle_fallback=False, action="Submitting" + ), + InlineMetricBundlePackager, + ) + + +def test_honors_explicit_packager() -> None: + explicit = CloudpickleMetricBundlePackager() + assert ( + resolve_default_metric_bundle_packager( + _builtin(), explicit, allow_cloudpickle_fallback=False, action="Submitting" + ) + is explicit + ) diff --git a/plugins/nemo-evaluator/tests/shared/metric_bundles/test_hybrid.py b/plugins/nemo-evaluator/tests/shared/metric_bundles/test_hybrid.py new file mode 100644 index 0000000000..f483e99e54 --- /dev/null +++ b/plugins/nemo-evaluator/tests/shared/metric_bundles/test_hybrid.py @@ -0,0 +1,70 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import cast + +from nemo_evaluator.shared.metric_bundles.bundles import MetricBundle, bundle_metric, unbundle_metric +from nemo_evaluator.shared.metric_bundles.hybrid import HybridMetricBundlePackager +from nemo_evaluator_sdk.metrics.exact_match import ExactMatchMetric +from nemo_evaluator_sdk.metrics.protocol import Metric, MetricInput, MetricOutput, MetricOutputSpec, MetricResult + + +class _CustomMetric: + """A protocol-satisfying metric that is not inline-bundleable (module-level so it cloudpickles).""" + + type = "custom-score" + description = "custom metric" + labels: dict[str, str] = {} + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("score")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + del input + return MetricResult(outputs=[MetricOutput(name="score", value=1.0)]) + + +def _roundtrip(bundle: MetricBundle) -> Metric: + return unbundle_metric(MetricBundle.model_validate_json(bundle.model_dump_json())) + + +def test_hybrid_inlines_builtin_metric() -> None: + metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") + + bundle = bundle_metric(metric, HybridMetricBundlePackager()) + + assert bundle.payload.kind == "inline" + assert type(_roundtrip(bundle)) is ExactMatchMetric + + +def test_hybrid_cloudpickles_custom_metric() -> None: + bundle = bundle_metric(cast(Metric, _CustomMetric()), HybridMetricBundlePackager()) + + assert bundle.payload.kind == "cloudpickle" + assert isinstance(_roundtrip(bundle), _CustomMetric) + + +def test_hybrid_routes_each_metric_independently() -> None: + """A mixed set bundles each metric with the lightest representation that supports it.""" + packager = HybridMetricBundlePackager() + metrics: list[Metric] = [ + ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}"), + cast(Metric, _CustomMetric()), + ] + + kinds = [bundle_metric(metric, packager).payload.kind for metric in metrics] + + assert kinds == ["inline", "cloudpickle"] + + +def test_hybrid_load_dispatches_by_payload_kind() -> None: + packager = HybridMetricBundlePackager() + inline_bundle = bundle_metric( + ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}"), packager + ) + cloudpickle_bundle = bundle_metric(cast(Metric, _CustomMetric()), packager) + + assert type(packager.load(inline_bundle.payload)) is ExactMatchMetric + assert isinstance(packager.load(cloudpickle_bundle.payload), _CustomMetric) diff --git a/plugins/nemo-evaluator/tests/shared/metric_bundles/test_inline.py b/plugins/nemo-evaluator/tests/shared/metric_bundles/test_inline.py new file mode 100644 index 0000000000..c31043ef58 --- /dev/null +++ b/plugins/nemo-evaluator/tests/shared/metric_bundles/test_inline.py @@ -0,0 +1,208 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import hashlib +import json +from collections.abc import Sequence +from typing import cast + +import pytest +from nemo_evaluator.api.schemas import MetricInline +from nemo_evaluator.shared.metric_bundles.bundles import ( + MetricBundle, + MetricBundlingError, + bundle_metric, + unbundle_metric, +) +from nemo_evaluator.shared.metric_bundles.inline import ( + InlineMetricBundlePackager, + InlineMetricPayload, + inline_bundle_supported, +) +from nemo_evaluator_sdk.enums import ModelFormat +from nemo_evaluator_sdk.metrics.bleu import BLEUMetric +from nemo_evaluator_sdk.metrics.exact_match import ExactMatchMetric +from nemo_evaluator_sdk.metrics.f1 import F1Metric +from nemo_evaluator_sdk.metrics.llm_judge import LLMJudgeMetric +from nemo_evaluator_sdk.metrics.number_check import NumberCheckMetric +from nemo_evaluator_sdk.metrics.protocol import Metric, MetricInput, MetricOutput, MetricOutputSpec, MetricResult +from nemo_evaluator_sdk.metrics.ragas import ( + AgentGoalAccuracyMetric, + AnswerAccuracyMetric, + ContextEntityRecallMetric, + ContextPrecisionMetric, + ContextRecallMetric, + ContextRelevanceMetric, + FaithfulnessMetric, + NoiseSensitivityMetric, + ResponseGroundednessMetric, + ResponseRelevancyMetric, + ToolCallAccuracyMetric, + TopicAdherenceMetric, +) +from nemo_evaluator_sdk.metrics.remote import NemoAgentToolkitRemoteMetric, RemoteMetric +from nemo_evaluator_sdk.metrics.rouge import ROUGEMetric +from nemo_evaluator_sdk.metrics.string_check import StringCheckMetric +from nemo_evaluator_sdk.metrics.tool_calling import ToolCallingMetric +from nemo_evaluator_sdk.values import Model, SecretRef +from nemo_evaluator_sdk.values.scores import JSONScoreParser, RangeScore, RemoteScore +from pydantic import BaseModel + + +class _CustomMetric: + """A protocol-satisfying metric that is not part of MetricsUnion.""" + + type = "custom-score" + description = "custom metric" + labels = {"source": "test"} + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("score")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + del input + return MetricResult(outputs=[MetricOutput(name="score", value=1.0)]) + + +def _judge_model() -> Model: + return Model(url="https://judge.example.test/v1/chat/completions", name="judge-model", format=ModelFormat.OPEN_AI) + + +def _embeddings_model() -> Model: + return Model(url="https://judge.example.test/v1/embeddings", name="embedding-model", format=ModelFormat.OPEN_AI) + + +def _builtin_metric_cases() -> Sequence[tuple[str, Metric]]: + judge_model = _judge_model() + return [ + ("exact_match", ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}")), + ("f1", F1Metric(reference="{{item.expected}}", candidate="{{item.output}}")), + ("bleu", BLEUMetric(references=["{{item.expected}}"], candidate="{{item.output}}")), + ("rouge", ROUGEMetric(reference="{{item.expected}}", candidate="{{item.output}}")), + ( + "string_check", + StringCheckMetric( + operation="contains", left_template="{{item.output}}", right_template="{{item.expected}}" + ), + ), + ( + "number_check", + NumberCheckMetric(operation="equals", left_template="{{item.left}}", right_template="{{item.right}}"), + ), + ("tool_calling", ToolCallingMetric(reference="{{item.expected_tool_calls}}")), + ( + "llm_judge", + LLMJudgeMetric( + model=judge_model, + scores=[ + RangeScore( + name="helpfulness", minimum=1, maximum=5, parser=JSONScoreParser(json_path="helpfulness") + ) + ], + prompt_template="Judge: {{item.expected}} -> {{item.output}}", + ), + ), + ( + "remote", + RemoteMetric( + url="https://remote.example.test", + body={"prompt": "{{item.prompt}}"}, + scores=[RemoteScore(name="quality", parser=JSONScoreParser(json_path="$.result.quality"))], + ), + ), + ( + "nemo_agent_toolkit_remote", + NemoAgentToolkitRemoteMetric(url="https://remote.example.test", evaluator_name="nat-quality"), + ), + ("topic_adherence", TopicAdherenceMetric(metric_mode="f1", judge_model=judge_model)), + ("tool_call_accuracy", ToolCallAccuracyMetric()), + ("agent_goal_accuracy", AgentGoalAccuracyMetric(judge_model=judge_model)), + ("answer_accuracy", AnswerAccuracyMetric(judge_model=judge_model)), + ("context_relevance", ContextRelevanceMetric(judge_model=judge_model)), + ("response_groundedness", ResponseGroundednessMetric(judge_model=judge_model)), + ("context_recall", ContextRecallMetric(judge_model=judge_model)), + ("context_precision", ContextPrecisionMetric(judge_model=judge_model)), + ("context_entity_recall", ContextEntityRecallMetric(judge_model=judge_model)), + ("response_relevancy", ResponseRelevancyMetric(judge_model=judge_model, embeddings_model=_embeddings_model())), + ("faithfulness", FaithfulnessMetric(judge_model=judge_model)), + ("noise_sensitivity", NoiseSensitivityMetric(judge_model=judge_model)), + ] + + +_CASES = _builtin_metric_cases() +_CASE_IDS = [name for name, _ in _CASES] + + +@pytest.mark.parametrize(("case_name", "metric"), _CASES, ids=_CASE_IDS) +def test_inline_packager_round_trips_every_builtin_metric(case_name: str, metric: Metric) -> None: + """Every built-in metric serializes inline and reconstructs to an identical object.""" + bundle = bundle_metric(metric, InlineMetricBundlePackager()) + + # Full wire round-trip: runtime bundle -> JSON -> runtime bundle. + restored = MetricBundle.model_validate_json(bundle.model_dump_json()) + hydrated = unbundle_metric(restored) + + assert restored.payload.kind == "inline", case_name + assert restored.metric_type == metric.type, case_name + assert type(hydrated) is type(metric), case_name + # Inline reconstruction must be lossless at the config level (not just the type). + assert cast(BaseModel, hydrated).model_dump() == cast(BaseModel, metric).model_dump(), case_name + assert [o.name for o in hydrated.output_spec()] == [o.name for o in metric.output_spec()], case_name + + +@pytest.mark.parametrize(("case_name", "metric"), _CASES, ids=_CASE_IDS) +def test_inline_payload_passes_through_wire_dto(case_name: str, metric: Metric) -> None: + """The inline payload survives the MetricInline wire DTO (OpenAPI contract).""" + bundle = bundle_metric(metric, InlineMetricBundlePackager()) + + wire = MetricInline.model_validate_json(bundle.model_dump_json()) + + assert wire.payload.kind == "inline", case_name + assert wire.metric_type + assert wire.outputs + # Re-validating the wire DTO JSON back into a runtime bundle must still hydrate. + runtime_again = MetricBundle.model_validate_json(wire.model_dump_json()) + assert type(unbundle_metric(runtime_again)) is type(metric), case_name + + +def test_inline_payload_digest_is_canonical_sha256() -> None: + metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") + bundle = bundle_metric(metric, InlineMetricBundlePackager()) + payload = InlineMetricPayload.model_validate(bundle.payload.model_dump(mode="python")) + + expected = hashlib.sha256( + json.dumps(payload.metric, sort_keys=True, separators=(",", ":")).encode("utf-8") + ).hexdigest() + assert payload.digest == expected + serialized = cast(dict[str, object], bundle.model_dump(mode="json")["payload"]) + assert serialized["kind"] == "inline" + assert serialized["digest"] == expected + + +def test_inline_packager_rejects_custom_metric() -> None: + with pytest.raises(MetricBundlingError, match="CloudpickleMetricBundlePackager"): + bundle_metric(cast(Metric, _CustomMetric()), InlineMetricBundlePackager()) + + +def test_inline_bundle_supported_classifies_metrics() -> None: + assert inline_bundle_supported(ExactMatchMetric(reference="{{item.expected}}")) + assert not inline_bundle_supported(cast(Metric, _CustomMetric())) + + +def test_inline_captures_metric_secrets() -> None: + metric = LLMJudgeMetric( + model=Model( + url="https://judge.example.test/v1/chat/completions", + name="judge-model", + api_key_secret=SecretRef(root="judge-secret"), + format=ModelFormat.OPEN_AI, + ), + scores=[RangeScore(name="helpfulness", minimum=1, maximum=5, parser=JSONScoreParser(json_path="helpfulness"))], + ) + + bundle = bundle_metric(metric, InlineMetricBundlePackager()) + restored = MetricBundle.model_validate_json(bundle.model_dump_json()) + + assert restored.secrets == {"judge_secret": SecretRef(root="judge-secret")} diff --git a/plugins/nemo-evaluator/tests/test_inline_bundle_execution.py b/plugins/nemo-evaluator/tests/test_inline_bundle_execution.py new file mode 100644 index 0000000000..2ac18a4c27 --- /dev/null +++ b/plugins/nemo-evaluator/tests/test_inline_bundle_execution.py @@ -0,0 +1,163 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""End-to-end execution tests for inline-bundled metrics. + +These tests run real metric scoring — no mocks. The end-to-end cases drive the +full evaluator job (`EvaluateJob`) through the local scheduler, exercising the +complete inline path: bundle -> MetricInline wire DTO -> job spec -> unbundle +(reconstruct from config) -> execute -> aggregate scores. The reconstruction +tests round-trip each metric through the bundle and then actually invoke the +hydrated metric's `compute_scores`. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, cast + +import pytest +from nemo_evaluator.jobs.evaluate import EvaluateJob +from nemo_evaluator.shared.metric_bundles.bundles import bundle_metric, unbundle_metric +from nemo_evaluator.shared.metric_bundles.hybrid import HybridMetricBundlePackager +from nemo_evaluator.shared.metric_bundles.inline import InlineMetricBundlePackager +from nemo_evaluator_sdk.execution.samples import build_metric_input +from nemo_evaluator_sdk.metrics.exact_match import ExactMatchMetric +from nemo_evaluator_sdk.metrics.f1 import F1Metric +from nemo_evaluator_sdk.metrics.number_check import NumberCheckMetric +from nemo_evaluator_sdk.metrics.protocol import Metric, MetricInput, MetricOutput, MetricOutputSpec, MetricResult +from nemo_evaluator_sdk.metrics.string_check import StringCheckMetric +from nemo_platform_plugin.scheduler import NemoJobScheduler + + +class _CustomConstantMetric: + """Module-level custom metric (cloudpicklable) that always scores 1.0.""" + + type = "custom-constant" + description = "custom constant metric" + labels: dict[str, str] = {} + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("constant")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + del input + return MetricResult(outputs=[MetricOutput(name="constant", value=1.0)]) + + +def _inline_payload(metric: Metric) -> dict[str, Any]: + """Bundle a metric inline and project it to the job-spec wire shape.""" + return bundle_metric(metric, InlineMetricBundlePackager()).model_dump(mode="json") + + +def _load_artifact_payload(run_result: dict[str, Any]) -> dict[str, Any]: + artifact_path = Path(run_result["artifact"]["artifact_url"].removeprefix("file://")) + return cast(dict[str, Any], json.loads(artifact_path.read_text(encoding="utf-8"))) + + +def _aggregate_scores(run_result: dict[str, Any]) -> list[dict[str, Any]]: + return cast(list[dict[str, Any]], _load_artifact_payload(run_result)["aggregate_scores"]["scores"]) + + +def test_evaluate_job_runs_inline_bundled_exact_match_metric() -> None: + """Full job run with an inline-bundled metric produces real aggregate scores.""" + spec = { + "metrics": [ + _inline_payload(ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.model_output}}")) + ], + "dataset": [ + {"expected": "blue", "model_output": "Blue"}, # normalizes equal -> 1.0 + {"expected": "Jupiter", "model_output": "Saturn"}, # -> 0.0 + ], + "params": {"parallelism": 2}, + } + + result = NemoJobScheduler().run_local(EvaluateJob, spec) + + scores = _aggregate_scores(result) + assert scores[0]["name"] == "exact-match.exact-match" + assert scores[0]["mean"] == 0.5 + + +def test_evaluate_job_runs_multiple_inline_metrics() -> None: + """Multiple inline-bundled metrics in one job each execute and aggregate.""" + spec = { + "metrics": [ + _inline_payload(ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.model_output}}")), + _inline_payload( + StringCheckMetric( + operation="contains", + left_template="{{item.model_output}}", + right_template="{{item.expected}}", + ) + ), + ], + "dataset": [ + {"expected": "Paris", "model_output": "Paris"}, + {"expected": "Paris", "model_output": "London"}, + ], + "params": {"parallelism": 2}, + } + + result = NemoJobScheduler().run_local(EvaluateJob, spec) + + by_name = {score["name"]: score for score in _aggregate_scores(result)} + assert by_name["exact-match.exact-match"]["mean"] == 0.5 + assert by_name["string-check.string-check"]["mean"] == 0.5 + + +def test_evaluate_job_runs_hybrid_bundled_mixed_metrics() -> None: + """Hybrid bundling: built-in goes inline, custom is cloudpickled, and both execute in one job.""" + packager = HybridMetricBundlePackager() + builtin_payload = bundle_metric( + ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.model_output}}"), packager + ).model_dump(mode="json") + custom_payload = bundle_metric(cast(Metric, _CustomConstantMetric()), packager).model_dump(mode="json") + + # The built-in stays inline (no Python-version coupling); only the custom metric is cloudpickled. + assert builtin_payload["payload"]["kind"] == "inline" + assert custom_payload["payload"]["kind"] == "cloudpickle" + + spec = { + "metrics": [builtin_payload, custom_payload], + "dataset": [ + {"expected": "blue", "model_output": "Blue"}, # exact-match -> 1.0 + {"expected": "Jupiter", "model_output": "Saturn"}, # exact-match -> 0.0 + ], + "params": {"parallelism": 2}, + } + + result = NemoJobScheduler().run_local(EvaluateJob, spec) + + by_name = {score["name"]: score for score in _aggregate_scores(result)} + assert by_name["exact-match.exact-match"]["mean"] == 0.5 + assert by_name["custom-constant.constant"]["mean"] == 1.0 + + +@pytest.mark.asyncio +async def test_round_tripped_deterministic_metrics_execute_identically() -> None: + """After an inline bundle round-trip, hydrated metrics score identically to the originals.""" + item = {"expected": "the answer is 42", "left": "42", "right": "42.0"} + sample = {"output_text": "The answer is 42!"} + + metrics: list[Metric] = [ + ExactMatchMetric(reference="{{item.expected}}", candidate="{{sample.output_text}}"), + F1Metric(reference="{{item.expected}}", candidate="{{sample.output_text}}"), + NumberCheckMetric(operation="equals", left_template="{{item.left}}", right_template="{{item.right}}"), + StringCheckMetric( + operation="contains", left_template="{{sample.output_text}}", right_template="{{item.expected}}" + ), + ] + + for metric in metrics: + hydrated = unbundle_metric(bundle_metric(metric, InlineMetricBundlePackager())) + + original_result = await metric.compute_scores(build_metric_input(item, sample)) + hydrated_result = await hydrated.compute_scores(build_metric_input(item, sample)) + + original_values = [(o.name, o.value) for o in original_result.outputs] + hydrated_values = [(o.name, o.value) for o in hydrated_result.outputs] + assert hydrated_values == original_values, type(metric).__name__ + # Sanity: the deterministic scorers actually produced a score. + assert hydrated_result.outputs, type(metric).__name__ diff --git a/plugins/nemo-evaluator/tests/test_sdk.py b/plugins/nemo-evaluator/tests/test_sdk.py index b83a42d3d1..6b3f90bd3b 100644 --- a/plugins/nemo-evaluator/tests/test_sdk.py +++ b/plugins/nemo-evaluator/tests/test_sdk.py @@ -31,8 +31,9 @@ bundle_metric, ) from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager +from nemo_evaluator.shared.metric_bundles.inline import InlineMetricBundlePackager from nemo_evaluator_sdk.metrics.exact_match import ExactMatchMetric -from nemo_evaluator_sdk.metrics.protocol import Metric +from nemo_evaluator_sdk.metrics.protocol import Metric, MetricInput, MetricOutput, MetricOutputSpec, MetricResult from nemo_evaluator_sdk.values import FieldMapping, Model, ModelRef, RunConfig, RunConfigOnline, RunConfigOnlineModel from nemo_evaluator_sdk.values.results import AggregatedMetricResult, EvaluationResult from nemo_platform import AsyncNeMoPlatform, NeMoPlatform @@ -98,6 +99,21 @@ def load(self, payload: MetricBundlePayload) -> Metric: raise NotImplementedError("test packager only exercises submission-side packaging") +class _CustomRuntimeMetric: + """A protocol-satisfying metric that is not part of MetricsUnion (not inline-bundleable).""" + + type = "custom-score" + description = "custom metric" + labels: dict[str, str] = {} + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("score")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + del input + return MetricResult(outputs=[MetricOutput(name="score", value=1.0)]) + + class _SyncPlatform: def __init__(self) -> None: self.base_url = "http://test:8000" @@ -215,6 +231,29 @@ def test_build_evaluate_spec_requires_metric_bundle_packager() -> None: ) +def test_local_run_allows_cloudpickle_fallback_for_custom_metric(mocker: MockerFixture) -> None: + """Local run() executes in the caller's process, so custom metrics fall back to cloudpickle. + + The fallback is enabled only for local execution; remote submit/create still require an + explicit cloudpickle opt-in (covered separately). + """ + import nemo_evaluator.sdk._executor as executor_module + + spy = mocker.spy(executor_module, "resolve_default_metric_bundle_packager") + resource = Evaluator(cast(NeMoPlatform, _SyncPlatform())) + # Short-circuit after packaging so we don't drive the local job runtime. + mocker.patch.object(resource._executor, "run_local", side_effect=RuntimeError("stop after packaging")) + + with pytest.raises(RuntimeError, match="stop after packaging"): + resource.run( + metric=cast(Metric, _CustomRuntimeMetric()), + dataset=[{"expected": "a", "output": "a"}], + ) + + # No MetricBundlePackagerPolicyError: the custom metric was bundled (via cloudpickle) and reached execution. + assert spy.call_args.kwargs["allow_cloudpickle_fallback"] is True + + def test_build_evaluate_spec_includes_target_and_prompt_template() -> None: """Online evaluator specs should preserve model targets and prompt templates.""" model = Model(url="https://model.test/v1", name="model-a") @@ -590,13 +629,27 @@ def test_accepts_model_ref_target(self, mocker: MockerFixture) -> None: metric_bundle_packager=packager, ) - def test_requires_metric_bundle_packager(self) -> None: - """Submit should fail fast before delegating without a remote metric packager.""" + def test_defaults_to_inline_packager_for_builtin_metric(self, mocker: MockerFixture) -> None: + """Submit of a built-in metric without an explicit packager defaults to inline bundling.""" resource = Evaluator(cast(NeMoPlatform, _SyncPlatform())) + expected_job = mocker.Mock(spec=EvaluatorJobResource) + submit = mocker.patch.object(resource._executor, "submit", return_value=expected_job) - with pytest.raises(ValueError, match="metric_bundle_packager is required"): + job = resource.submit( + metric=ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}"), + dataset=[{"expected": "a", "output": "a"}], + ) + + assert job is expected_job + assert isinstance(submit.call_args.kwargs["metric_bundle_packager"], InlineMetricBundlePackager) + + def test_requires_explicit_packager_for_custom_metric(self) -> None: + """Submit of a custom metric requires an explicit cloudpickle opt-in.""" + resource = Evaluator(cast(NeMoPlatform, _SyncPlatform())) + + with pytest.raises(MetricBundlePackagerPolicyError, match="CloudpickleMetricBundlePackager"): resource.submit( - metric=ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}"), + metric=cast(Metric, _CustomRuntimeMetric()), dataset=[{"expected": "a", "output": "a"}], ) @@ -1071,13 +1124,28 @@ async def test_accepts_model_ref_target(self, mocker: MockerFixture) -> None: ) @pytest.mark.asyncio - async def test_requires_metric_bundle_packager(self) -> None: - """Submit should fail fast before delegating without a remote metric packager.""" + async def test_defaults_to_inline_packager_for_builtin_metric(self, mocker: MockerFixture) -> None: + """Async submit of a built-in metric defaults to inline bundling.""" + resource = AsyncEvaluator(cast(AsyncNeMoPlatform, _AsyncPlatform())) + expected_job = mocker.Mock(spec=AsyncEvaluatorJobResource) + submit = mocker.patch.object(resource._executor, "submit", new=AsyncMock(return_value=expected_job)) + + job = await resource.submit( + metric=ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}"), + dataset=[{"expected": "a", "output": "a"}], + ) + + assert job is expected_job + assert isinstance(submit.call_args.kwargs["metric_bundle_packager"], InlineMetricBundlePackager) + + @pytest.mark.asyncio + async def test_requires_explicit_packager_for_custom_metric(self) -> None: + """Async submit of a custom metric requires an explicit cloudpickle opt-in.""" resource = AsyncEvaluator(cast(AsyncNeMoPlatform, _AsyncPlatform())) - with pytest.raises(ValueError, match="metric_bundle_packager is required"): + with pytest.raises(MetricBundlePackagerPolicyError, match="CloudpickleMetricBundlePackager"): await resource.submit( - metric=ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}"), + metric=cast(Metric, _CustomRuntimeMetric()), dataset=[{"expected": "a", "output": "a"}], )