From 5c3e1015dadba4397da57ec493d0f2801f4a3783 Mon Sep 17 00:00:00 2001 From: Sandy Chapman Date: Wed, 24 Jun 2026 13:10:41 -0300 Subject: [PATCH 1/2] feat(evaluator): inline + hybrid metric bundlers; built-ins skip cloudpickle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Built-in evaluator metrics can now bundle as their own JSON configuration instead of a cloudpickle blob, so durable jobs no longer require an explicit metric_bundle_packager and avoid the Python-version coupling of cloudpickle payloads. Reconstruction is pure data validation via the MetricsUnion discriminated union — no arbitrary code runs on load. New bundlers (shared/metric_bundles/): - inline.py: InlineMetricPayload + InlineMetricBundlePackager (kind="inline"). - hybrid.py: HybridMetricBundlePackager — inline per metric, cloudpickle only for metrics that cannot be reconstructed from config. - defaults.py: resolve_default_metric_bundle_packager — selection policy. Default behavior: - Built-in metrics bundle inline everywhere (run/submit/create); no packager needed. - Local run() of a custom metric falls back to cloudpickle automatically (in-process; opt out via the executor's allow_cloudpickle_fallback=False). - Durable submit()/metric create() of a custom metric require an explicit packager — shipping arbitrary code to the shared service stays opt-in. Wire contract: api/schemas.py adds InlineMetricPayload to the MetricPayload discriminated union; openapi spec regenerated (make refresh-openapi). The hand-written evaluator SDK is updated accordingly; no Stainless regen needed. Docs: drop the now-unnecessary CloudpickleMetricBundlePackager from built-in submit() examples; keep it only in the custom-Python-metrics tutorial (points to Hybrid as the recommended packager). ModelRef example keeps config=RunConfigOnlineModel(). Fix two illustrative fragment blocks so all 261 doc code blocks compile, and update the offline contract test to the new behavior. 
Validated: 305 evaluator unit tests; offline contract test (8); deterministic submit/run examples and the LLM-judge tutorial run end-to-end against a live platform (packager-free durable submit completed against a real judge model). Co-Authored-By: Claude Opus 4.8 Signed-off-by: Sandy Chapman --- docs/evaluator/index.mdx | 2 - .../evaluator/metrics/agent-configuration.mdx | 2 - docs/evaluator/metrics/agentic.mdx | 14 -- docs/evaluator/metrics/job-management.mdx | 2 - docs/evaluator/metrics/llm-as-a-judge.mdx | 20 +- docs/evaluator/metrics/manage-metrics.mdx | 2 - .../evaluator/metrics/model-configuration.mdx | 2 - docs/evaluator/metrics/rag.mdx | 40 +--- docs/evaluator/metrics/remote.mdx | 4 - docs/evaluator/metrics/results.mdx | 2 - docs/evaluator/metrics/similarity.mdx | 20 +- docs/evaluator/sdk-resources.mdx | 14 +- docs/evaluator/test_doc_examples.py | 49 ++++- .../tutorials/run-llm-judge-evaluation.mdx | 4 - plugins/nemo-evaluator/openapi/openapi.yaml | 36 +++ .../src/nemo_evaluator/api/schemas.py | 27 ++- .../src/nemo_evaluator/jobs/evaluate.py | 5 +- .../src/nemo_evaluator/metric_storage.py | 5 +- .../src/nemo_evaluator/sdk/_executor.py | 29 ++- .../nemo_evaluator/sdk/metric_resources.py | 11 +- .../src/nemo_evaluator/sdk/resources.py | 19 +- .../shared/metric_bundles/bundles.py | 4 + .../shared/metric_bundles/defaults.py | 66 ++++++ .../shared/metric_bundles/hybrid.py | 48 ++++ .../shared/metric_bundles/inline.py | 107 +++++++++ .../tests/sdk/test_metric_sdk_resources.py | 45 +++- .../shared/metric_bundles/test_defaults.py | 78 +++++++ .../shared/metric_bundles/test_hybrid.py | 70 ++++++ .../shared/metric_bundles/test_inline.py | 208 ++++++++++++++++++ .../tests/test_inline_bundle_execution.py | 163 ++++++++++++++ plugins/nemo-evaluator/tests/test_sdk.py | 86 +++++++- 31 files changed, 1014 insertions(+), 170 deletions(-) create mode 100644 plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/defaults.py create mode 100644 
plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/hybrid.py create mode 100644 plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/inline.py create mode 100644 plugins/nemo-evaluator/tests/shared/metric_bundles/test_defaults.py create mode 100644 plugins/nemo-evaluator/tests/shared/metric_bundles/test_hybrid.py create mode 100644 plugins/nemo-evaluator/tests/shared/metric_bundles/test_inline.py create mode 100644 plugins/nemo-evaluator/tests/test_inline_bundle_execution.py diff --git a/docs/evaluator/index.mdx b/docs/evaluator/index.mdx index b1d8f5f6cc..376fc17a26 100644 --- a/docs/evaluator/index.mdx +++ b/docs/evaluator/index.mdx @@ -59,7 +59,6 @@ Submit your evaluation to the Evaluator service using the NeMo Platform SDK: ```python from nemo_evaluator.sdk import Evaluator -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager from nemo_platform import NeMoPlatform @@ -74,7 +73,6 @@ job = evaluator.submit( metric=metric, dataset=dataset, config=config, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() diff --git a/docs/evaluator/metrics/agent-configuration.mdx b/docs/evaluator/metrics/agent-configuration.mdx index ca855031c9..012bc52d59 100644 --- a/docs/evaluator/metrics/agent-configuration.mdx +++ b/docs/evaluator/metrics/agent-configuration.mdx @@ -164,7 +164,6 @@ from nemo_evaluator_sdk import Agent, RunConfigOnline from nemo_evaluator_sdk import ExactMatchMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ExactMatchMetric(reference="{{item.expected_answer}}") agent = Agent( url="https://my-nat-agent.example.com", @@ -185,7 +184,6 @@ job = evaluator.submit( {"role": "user", "content": "{{item.question}}"}, ], }, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() diff --git a/docs/evaluator/metrics/agentic.mdx 
b/docs/evaluator/metrics/agentic.mdx index d45e4c0a05..269c1caea6 100644 --- a/docs/evaluator/metrics/agentic.mdx +++ b/docs/evaluator/metrics/agentic.mdx @@ -228,7 +228,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import RunConfig from nemo_evaluator_sdk.metrics.ragas import ToolCallAccuracyMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ToolCallAccuracyMetric() job = evaluator.submit( @@ -251,7 +250,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -430,7 +428,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import RunConfig, ToolCallingMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ToolCallingMetric(reference="{{item.tool_calls}}") @@ -465,7 +462,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -572,7 +568,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import RunConfig, Model from nemo_evaluator_sdk.metrics.ragas import TopicAdherenceMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager judge_model = Model( url="https://integrate.api.nvidia.com/v1/chat/completions", @@ -596,7 +591,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -768,7 +762,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import RunConfig, Model from nemo_evaluator_sdk.metrics.ragas import AgentGoalAccuracyMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager judge_model = Model( 
url="https://integrate.api.nvidia.com/v1/chat/completions", @@ -798,7 +791,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -925,7 +917,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import RunConfig, Model from nemo_evaluator_sdk.metrics.ragas import AgentGoalAccuracyMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager judge_model = Model( url="https://integrate.api.nvidia.com/v1/chat/completions", @@ -963,7 +954,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -1023,7 +1013,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import RunConfig, Model from nemo_evaluator_sdk.metrics.ragas import AnswerAccuracyMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager judge_model = Model( url="https://integrate.api.nvidia.com/v1/chat/completions", @@ -1042,7 +1031,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -1055,7 +1043,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import RunConfigOnlineModel, InferenceParams, Model from nemo_evaluator_sdk.metrics.ragas import AnswerAccuracyMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager judge_model = Model( url="https://integrate.api.nvidia.com/v1/chat/completions", @@ -1090,7 +1077,6 @@ job = evaluator.submit( } ] }, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() diff --git a/docs/evaluator/metrics/job-management.mdx b/docs/evaluator/metrics/job-management.mdx index 20391c2aef..65fa3fa576 100644 --- 
a/docs/evaluator/metrics/job-management.mdx +++ b/docs/evaluator/metrics/job-management.mdx @@ -21,7 +21,6 @@ from nemo_evaluator.sdk import Evaluator from nemo_platform import NeMoPlatform from nemo_evaluator_sdk import RunConfig from nemo_evaluator_sdk import ExactMatchMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager sdk = NeMoPlatform( base_url=os.environ.get("NMP_BASE_URL", "http://localhost:8080"), @@ -38,7 +37,6 @@ job = evaluator.submit( {"expected": "Berlin", "output": "Munich"}, ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) print("Submitted job:", job.name) diff --git a/docs/evaluator/metrics/llm-as-a-judge.mdx b/docs/evaluator/metrics/llm-as-a-judge.mdx index 935ec30bd7..886be51df2 100644 --- a/docs/evaluator/metrics/llm-as-a-judge.mdx +++ b/docs/evaluator/metrics/llm-as-a-judge.mdx @@ -298,7 +298,6 @@ For production workloads, submit the same metric and dataset as a durable platfo ```python from nemo_evaluator_sdk import RunConfig, JSONScoreParser, Model, RubricScore, LLMJudgeMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = LLMJudgeMetric( model=Model( @@ -347,7 +346,6 @@ job = evaluator.submit( {"input": "What is 2 + 2?", "output": "4"}, ], config=RunConfig(parallelism=8, limit_samples=100), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) print("Submitted job:", job.name) @@ -436,13 +434,13 @@ By default, the JSON parser is used for range and rubric scores, with the score ```python # JSON parser (default) -"parser": {"type": "json", "json_path": "quality"} +parser = {"type": "json", "json_path": "quality"} # Regex parser (for models that do not support structured output) -"parser": {"type": "regex", "pattern": "QUALITY: (\\w+)"} +parser = {"type": "regex", "pattern": "QUALITY: (\\w+)"} # Regex parser with method='search' (finds pattern anywhere in text) -"parser": {"type": 
"regex", "pattern": "SCORE: (\\d+)", "method": "search"} +parser = {"type": "regex", "pattern": "SCORE: (\\d+)", "method": "search"} ``` @@ -642,15 +640,13 @@ metric = { Control judge model behavior with inference parameters: ```python - -"prompt_template": { +prompt_template = { "messages": [...], - "temperature": 0.1, # Lower for more consistent scoring - "max_tokens": 1024, # Increase if judge needs more space - "timeout": 30, # Request timeout in seconds - "stop": ["<{{ end_of_text }}>"] # Stop sequences + "temperature": 0.1, # Lower for more consistent scoring + "max_tokens": 1024, # Increase if judge needs more space + "timeout": 30, # Request timeout in seconds + "stop": ["<{{ end_of_text }}>"], # Stop sequences } - ``` diff --git a/docs/evaluator/metrics/manage-metrics.mdx b/docs/evaluator/metrics/manage-metrics.mdx index 79d051fa10..0784c869c5 100644 --- a/docs/evaluator/metrics/manage-metrics.mdx +++ b/docs/evaluator/metrics/manage-metrics.mdx @@ -98,7 +98,6 @@ For online evaluations, provide a model or agent target and use the online param ```python from nemo_evaluator_sdk import RunConfig, ExactMatchMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") @@ -109,7 +108,6 @@ job = evaluator.submit( {"expected": "Berlin", "output": "Munich"}, ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() diff --git a/docs/evaluator/metrics/model-configuration.mdx b/docs/evaluator/metrics/model-configuration.mdx index 404378ac3d..537a41e5f0 100644 --- a/docs/evaluator/metrics/model-configuration.mdx +++ b/docs/evaluator/metrics/model-configuration.mdx @@ -208,14 +208,12 @@ Durable remote `evaluator.submit(...)` jobs additionally accept a `ModelRef` tar ```python from nemo_evaluator_sdk import ModelRef, RunConfigOnlineModel -from 
nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager job = evaluator.submit( metric=metric, dataset=dataset, config=RunConfigOnlineModel(), target=ModelRef(root="default/my-model"), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) ``` diff --git a/docs/evaluator/metrics/rag.mdx b/docs/evaluator/metrics/rag.mdx index 68bcd58a5f..5502ca3a81 100644 --- a/docs/evaluator/metrics/rag.mdx +++ b/docs/evaluator/metrics/rag.mdx @@ -61,13 +61,7 @@ evaluator: Evaluator = client.evaluator # this object is an Evaluator resource Use `evaluator.run(metric=metric, dataset=dataset)` for a local synchronous evaluation. Use `evaluator.submit(metric=metric, dataset=dataset)` when you need a durable remote job: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - -job = evaluator.submit( - metric=metric, - dataset=dataset, - metric_bundle_packager=CloudpickleMetricBundlePackager(), -) +job = evaluator.submit(metric=metric, dataset=dataset) job.wait_until_done() result = job.get_result() ``` @@ -226,15 +220,12 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = ContextRecallMetric(judge_model=judge_model) job = evaluator.submit( metric=metric, dataset=offline_rows, config=RunConfig(parallelism=8), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -305,15 +296,12 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = ContextPrecisionMetric(judge_model=judge_model) job = evaluator.submit( metric=metric, dataset=offline_rows, config=RunConfig(parallelism=8), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -387,8 +375,6 @@ for score in 
result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = ContextRelevanceMetric(judge_model=judge_model) job = evaluator.submit( @@ -400,7 +386,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=8), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -455,8 +440,6 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = ContextEntityRecallMetric(judge_model=judge_model) job = evaluator.submit( @@ -468,7 +451,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=8), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -533,8 +515,6 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = FaithfulnessMetric(judge_model=judge_model) job = evaluator.submit( @@ -543,7 +523,6 @@ job = evaluator.submit( config=online_config, target=generation_model, prompt_template=online_prompt_template, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -607,8 +586,6 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = ResponseGroundednessMetric(judge_model=judge_model) job = evaluator.submit( @@ -617,7 +594,6 @@ job = evaluator.submit( config=online_config, target=generation_model, prompt_template=online_prompt_template, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -680,8 +656,6 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import 
CloudpickleMetricBundlePackager - metric = NoiseSensitivityMetric(judge_model=judge_model) job = evaluator.submit( @@ -698,7 +672,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=8), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -777,8 +750,6 @@ for score in result.aggregate_scores.scores: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - metric = ResponseRelevancyMetric( judge_model=judge_model, embeddings_model=embeddings_model, @@ -791,7 +762,6 @@ job = evaluator.submit( config=online_config, target=generation_model, prompt_template=online_prompt_template, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -915,13 +885,7 @@ judge_model = Model( For durable remote execution, submit the same metric and dataset that you tested locally: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - -job = evaluator.submit( - metric=metric, - dataset=dataset, - metric_bundle_packager=CloudpickleMetricBundlePackager(), -) +job = evaluator.submit(metric=metric, dataset=dataset) job.wait_until_done() artifacts_dir = job.download_artifacts(path="evaluation_artifacts") print(f"Saved artifacts under {artifacts_dir}") diff --git a/docs/evaluator/metrics/remote.mdx b/docs/evaluator/metrics/remote.mdx index b14f66bdc1..8886217f4d 100644 --- a/docs/evaluator/metrics/remote.mdx +++ b/docs/evaluator/metrics/remote.mdx @@ -155,7 +155,6 @@ For production workloads, submit the same metric and dataset as a durable platfo ```python from nemo_evaluator_sdk import RunConfig, JSONScoreParser, RemoteScore, RemoteMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = RemoteMetric( url="https://my-evaluation-server.test/evaluate", @@ -183,7 +182,6 @@ job = evaluator.submit( {"reference": "2", 
"output": "2"}, ], config=RunConfig(parallelism=8), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) print("Submitted job:", job.name) @@ -199,7 +197,6 @@ for score in result.aggregate_scores.scores: ```python from nemo_evaluator_sdk import RunConfig, NemoAgentToolkitRemoteMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = NemoAgentToolkitRemoteMetric( url="http://localhost:8001/evaluate_item", @@ -223,7 +220,6 @@ job = evaluator.submit( } ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() diff --git a/docs/evaluator/metrics/results.mdx b/docs/evaluator/metrics/results.mdx index 41bc7d82e6..07a1b333ea 100644 --- a/docs/evaluator/metrics/results.mdx +++ b/docs/evaluator/metrics/results.mdx @@ -47,7 +47,6 @@ result = evaluator.run( ```python from nemo_evaluator_sdk import RunConfig, ExactMatchMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") @@ -58,7 +57,6 @@ job = evaluator.submit( {"expected": "Berlin", "output": "Munich"}, ], config=RunConfig(parallelism=4), - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() ``` diff --git a/docs/evaluator/metrics/similarity.mdx b/docs/evaluator/metrics/similarity.mdx index 3be35bf736..75138076e1 100644 --- a/docs/evaluator/metrics/similarity.mdx +++ b/docs/evaluator/metrics/similarity.mdx @@ -31,13 +31,7 @@ evaluator: Evaluator = sdk.evaluator # this object is an Evaluator resource Use `evaluator.run(metric=metric, dataset=dataset)` for a local synchronous evaluation. 
Use `evaluator.submit(metric=metric, dataset=dataset)` when you need a durable remote job: ```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - -job = evaluator.submit( - metric=metric, - dataset=dataset, - metric_bundle_packager=CloudpickleMetricBundlePackager(), -) +job = evaluator.submit(metric=metric, dataset=dataset) job.wait_until_done() result = job.get_result() ``` @@ -111,7 +105,6 @@ for score in result.aggregate_scores.scores: ```python from nemo_evaluator_sdk import BLEUMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = BLEUMetric( references=["{{item.reference_1}}", "{{item.reference_2}}"], @@ -133,7 +126,6 @@ job = evaluator.submit( "model_output": "Hello world!", }, ], - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -206,7 +198,6 @@ for score in result.aggregate_scores.scores: ```python from nemo_evaluator_sdk import ExactMatchMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ExactMatchMetric( reference="{{item.correct_answer | lower | trim}}", @@ -220,7 +211,6 @@ job = evaluator.submit( {"correct_answer": "London", "model_answer": "london "}, {"correct_answer": "Berlin", "model_answer": "Munich"}, ], - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -289,7 +279,6 @@ for score in result.aggregate_scores.scores: ```python from nemo_evaluator_sdk import F1Metric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = F1Metric( reference="{{item.reference}}", @@ -305,7 +294,6 @@ job = evaluator.submit( }, {"reference": "a red apple", "answer": "red apple"}, ], - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -382,7 +370,6 @@ for score in 
result.aggregate_scores.scores: ```python from nemo_evaluator_sdk import NumberCheckMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = NumberCheckMetric( operation=">", @@ -399,7 +386,6 @@ job = evaluator.submit( {"predicted": "0.5"}, {"predicted": "0.1"}, ], - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -471,7 +457,6 @@ for score in result.aggregate_scores.scores: ```python from nemo_evaluator_sdk import ROUGEMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ROUGEMetric( reference="{{item.reference_summary}}", @@ -490,7 +475,6 @@ job = evaluator.submit( "model_summary": "High winds delayed the launch.", }, ], - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() @@ -579,7 +563,6 @@ for score in result.aggregate_scores.scores: ```python from nemo_evaluator_sdk import StringCheckMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = StringCheckMetric( operation="startswith", @@ -595,7 +578,6 @@ job = evaluator.submit( {"output": "Answer: Success"}, {"output": "Error occurred"}, ], - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) job.wait_until_done() result = job.get_result() diff --git a/docs/evaluator/sdk-resources.mdx b/docs/evaluator/sdk-resources.mdx index b1ed7d36e6..ca0ee6d708 100644 --- a/docs/evaluator/sdk-resources.mdx +++ b/docs/evaluator/sdk-resources.mdx @@ -88,7 +88,6 @@ print(result.aggregate_scores) ```python from nemo_evaluator_sdk import ExactMatchMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") @@ -97,11 +96,7 @@ dataset = [ {"expected": "Berlin", "output": "Munich"}, ] -job = evaluator.submit( - 
metric=metric, - dataset=dataset, - metric_bundle_packager=CloudpickleMetricBundlePackager(), -) +job = evaluator.submit(metric=metric, dataset=dataset) job.wait_until_done() result = job.get_result() print(result.aggregate_scores) @@ -143,7 +138,6 @@ evaluator: AsyncEvaluator = client.evaluator import asyncio from nemo_evaluator_sdk import ExactMatchMetric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") @@ -154,11 +148,7 @@ dataset = [ async def main() -> None: - job = await evaluator.submit( - metric=metric, - dataset=dataset, - metric_bundle_packager=CloudpickleMetricBundlePackager(), - ) + job = await evaluator.submit(metric=metric, dataset=dataset) await job.wait_until_done() result = await job.get_result() print(result.aggregate_scores) diff --git a/docs/evaluator/test_doc_examples.py b/docs/evaluator/test_doc_examples.py index 2d9f337a1f..59e05fcdd3 100644 --- a/docs/evaluator/test_doc_examples.py +++ b/docs/evaluator/test_doc_examples.py @@ -10,9 +10,10 @@ import paths and call contract that every runnable doc snippet relies on, so the docs cannot silently drift from the SDK again. -These checks run fully offline: they exercise import locations and the -client-side argument validation in ``Evaluator.submit`` / ``AsyncEvaluator.submit``. -They do not submit jobs and need no running platform or model credentials. +These checks run fully offline: they exercise import locations and the packager +contract for ``Evaluator.submit`` — built-in metrics bundle inline and need no +packager, while custom metrics require an explicit one. They do not submit jobs +and need no running platform or model credentials. 
Run directly: uv run python docs/evaluator/test_doc_examples.py @@ -27,9 +28,26 @@ import pytest from nemo_evaluator.sdk import Evaluator +from nemo_evaluator.shared.metric_bundles.bundles import MetricBundlePackagerPolicyError +from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult from nemo_platform import NeMoPlatform +class _CustomMetric: + """A metric that is not a built-in type (cannot be reconstructed from config).""" + + type = "custom-score" + description = "custom metric" + labels: dict[str, str] = {} + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("score")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + del input + return MetricResult(outputs=[MetricOutput(name="score", value=1.0)]) + + def test_filesetref_imports_from_platform_sdk() -> None: """Docs import ``FilesetRef`` from ``nemo_evaluator.sdk`` (platform helpers).""" from nemo_evaluator.sdk import FilesetRef @@ -84,16 +102,35 @@ def test_packager_param_is_submit_only() -> None: assert "metric_bundle_packager" not in run_params -def test_submit_requires_metric_bundle_packager() -> None: - """``submit()`` without a packager raises the documented ValueError, offline.""" +def test_builtin_submit_does_not_require_a_packager() -> None: + """Built-in metrics bundle inline, so docs omit the packager on ``submit()``. + + Submitting reaches the executor (which then needs a live service); the point + is only that no packager-policy error is raised for a built-in metric. 
+ """ from nemo_evaluator_sdk import ExactMatchMetric evaluator = _evaluator() metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") dataset = [{"expected": "Paris", "output": "Paris"}] - with pytest.raises(ValueError, match="metric_bundle_packager is required"): + try: evaluator.submit(metric=metric, dataset=dataset) + except MetricBundlePackagerPolicyError as error: # pragma: no cover - regression guard + pytest.fail(f"built-in submit should not require a packager: {error}") + except Exception: + # Any other failure (e.g. connection refused with no live service) is fine; + # it means the built-in metric was bundled inline and reached execution. + pass + + +def test_custom_submit_requires_an_explicit_packager() -> None: + """Custom (non-built-in) metrics still require an explicit packager for durable submit.""" + evaluator = _evaluator() + dataset = [{"expected": "Paris", "output": "Paris"}] + + with pytest.raises(MetricBundlePackagerPolicyError, match="CloudpickleMetricBundlePackager"): + evaluator.submit(metric=_CustomMetric(), dataset=dataset) def test_run_does_not_require_metric_bundle_packager() -> None: diff --git a/docs/evaluator/tutorials/run-llm-judge-evaluation.mdx b/docs/evaluator/tutorials/run-llm-judge-evaluation.mdx index f66164f258..22e1aed14e 100644 --- a/docs/evaluator/tutorials/run-llm-judge-evaluation.mdx +++ b/docs/evaluator/tutorials/run-llm-judge-evaluation.mdx @@ -364,8 +364,6 @@ The first response is comprehensive and helpful, while the second is unhelpfully Now let's evaluate a larger sample and compare the judge's predictions against human annotations. This tells us how well our judge aligns with human judgment. 
```python -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager - sample_config = RunConfig( parallelism=1, limit_samples=5, @@ -396,7 +394,6 @@ job_v1 = evaluator.submit( metric=metric_v1_remote, dataset=dataset_ref, config=sample_config, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) print(f"Job submitted: {job_v1.name}") ``` @@ -510,7 +507,6 @@ job_v2 = evaluator.submit( metric=metric_v2_remote, dataset=dataset_ref, config=sample_config, - metric_bundle_packager=CloudpickleMetricBundlePackager(), ) print(f"Job submitted: {job_v2.name}") diff --git a/plugins/nemo-evaluator/openapi/openapi.yaml b/plugins/nemo-evaluator/openapi/openapi.yaml index 4b1f07209e..aa28f86ee9 100644 --- a/plugins/nemo-evaluator/openapi/openapi.yaml +++ b/plugins/nemo-evaluator/openapi/openapi.yaml @@ -1118,6 +1118,40 @@ components: description: Parameters for model inference. Extra fields can be supplied for additional options applied to the inference request directly. Fields not supported by the model may cause inference errors during evaluation. + InlineMetricPayload: + properties: + kind: + type: string + const: inline + title: Kind + description: Payload format discriminator. + metric: + additionalProperties: true + type: object + title: Metric + description: JSON-serialized built-in metric configuration, discriminated + by its own `type`. + digest: + title: Digest + description: SHA-256 digest of the canonical metric JSON. Informational; + recomputed server-side. + type: string + additionalProperties: false + type: object + required: + - kind + - metric + title: InlineMetricPayload + description: 'Wire schema for an inline (config-serialized) metric payload. + + + Mirrors the runtime ``InlineMetricPayload``. The metric is stored as its own + + JSON configuration and reconstructed from the metric type union at execution, + + so no code is shipped or executed on load. Used for platform-recognized + + built-in metric types.' 
Metric: properties: id: @@ -1272,12 +1306,14 @@ components: payload: oneOf: - $ref: '#/components/schemas/CloudpickleMetricPayload' + - $ref: '#/components/schemas/InlineMetricPayload' title: Payload description: Format-specific serialized metric. discriminator: propertyName: kind mapping: cloudpickle: '#/components/schemas/CloudpickleMetricPayload' + inline: '#/components/schemas/InlineMetricPayload' additionalProperties: false type: object required: diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/api/schemas.py b/plugins/nemo-evaluator/src/nemo_evaluator/api/schemas.py index 927a19224b..8f7f7e93cd 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/api/schemas.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/api/schemas.py @@ -7,7 +7,7 @@ from datetime import datetime from enum import StrEnum -from typing import Annotated, Literal +from typing import Annotated, Any, Literal from nemo_evaluator.shared.metric_bundles.bundles import ( BundledMetricOutputSpec, @@ -40,9 +40,30 @@ class CloudpickleMetricPayload(BaseModel): ) +class InlineMetricPayload(BaseModel): + """Wire schema for an inline (config-serialized) metric payload. + + Mirrors the runtime ``InlineMetricPayload``. The metric is stored as its own + JSON configuration and reconstructed from the metric type union at execution, + so no code is shipped or executed on load. Used for platform-recognized + built-in metric types. + """ + + model_config = ConfigDict(extra="forbid") + + kind: Literal["inline"] = Field(description="Payload format discriminator.") + metric: dict[str, Any] = Field( + description="JSON-serialized built-in metric configuration, discriminated by its own `type`." + ) + digest: str | None = Field( + default=None, + description="SHA-256 digest of the canonical metric JSON. Informational; recomputed server-side.", + ) + + # Discriminated on ``kind`` so additional payload formats can join the union -# without changing the field type. Cloudpickle is the only kind today. 
-MetricPayload = Annotated[CloudpickleMetricPayload, Field(discriminator="kind")] +# without changing the field type. +MetricPayload = Annotated[CloudpickleMetricPayload | InlineMetricPayload, Field(discriminator="kind")] class MetricInline(BaseModel): diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py b/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py index 86a4842d1e..ed4c92441f 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py @@ -11,6 +11,10 @@ from pathlib import Path from typing import Annotated, Any, ClassVar, Self, TypeAlias +# Imported for their registration side effects: each module registers its +# payload kind in the bundle registry so MetricBundle payloads validate. +import nemo_evaluator.shared.metric_bundles.cloudpickle # noqa: F401 +import nemo_evaluator.shared.metric_bundles.inline # noqa: F401 from nemo_evaluator.api.schemas import MetricInline from nemo_evaluator.filesets import FilesetRef, download_dataset, download_dataset_sync from nemo_evaluator.metric_refs import MetricRef, resolve_metric_specs @@ -21,7 +25,6 @@ metric_bundle_packager_for_payload, unbundle_metric, ) -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricPayload # noqa: F401 from nemo_evaluator_sdk import Evaluator from nemo_evaluator_sdk.execution.config import resolve_params from nemo_evaluator_sdk.execution.metric_execution import run_sync diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/metric_storage.py b/plugins/nemo-evaluator/src/nemo_evaluator/metric_storage.py index b15baddf7d..5cbebd5432 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/metric_storage.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/metric_storage.py @@ -16,9 +16,10 @@ import logging import uuid -# Importing the cloudpickle module registers the "cloudpickle" payload kind in -# the bundle registry so MetricBundle payloads round-trip through 
validation. +# Importing the payload modules registers their bundle payload kinds in the +# bundle registry so MetricBundle payloads round-trip through validation. import nemo_evaluator.shared.metric_bundles.cloudpickle # noqa: F401 +import nemo_evaluator.shared.metric_bundles.inline # noqa: F401 from nemo_evaluator.shared.metric_bundles.bundles import MetricBundle from nemo_platform import AsyncNeMoPlatform from pydantic import ValidationError diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/sdk/_executor.py b/plugins/nemo-evaluator/src/nemo_evaluator/sdk/_executor.py index c5da61daa4..e6b48a10d5 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/sdk/_executor.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/sdk/_executor.py @@ -23,8 +23,13 @@ ) from nemo_evaluator.sdk.types import PluginDatasetInput from nemo_evaluator.sdk.utils import filter_benchmark_result, filter_evaluation_result -from nemo_evaluator.shared.metric_bundles.bundles import MetricBundle, MetricBundlePackager, bundle_metric -from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager +from nemo_evaluator.shared.metric_bundles.bundles import ( + MetricBundle, + MetricBundlePackager, + MetricBundlePackagerPolicyError, + bundle_metric, +) +from nemo_evaluator.shared.metric_bundles.defaults import resolve_default_metric_bundle_packager from nemo_evaluator_sdk.datasets.loader import prepare_dataset_rows from nemo_evaluator_sdk.execution.config import resolve_params from nemo_evaluator_sdk.execution.metric_execution import run_sync @@ -52,10 +57,6 @@ SubmitTargetSpec = TargetSpec | ModelRef -class MetricBundlePackagerPolicyError(RuntimeError): - """Raised when plugin backend metric packaging is not configured.""" - - def _require_metric_bundle_packager(metric_bundle_packager: MetricBundlePackager | None) -> MetricBundlePackager: if metric_bundle_packager is None: raise MetricBundlePackagerPolicyError( @@ -307,7 +308,9 @@ def evaluate( target=target, 
field_mapping=field_mapping, prompt_template=prompt_template, - metric_bundle_packager=CloudpickleMetricBundlePackager(), + metric_bundle_packager=resolve_default_metric_bundle_packager( + metric, None, allow_cloudpickle_fallback=True, action="Running" + ), ) payload = self.run_local( spec=spec, @@ -367,7 +370,9 @@ def evaluate_benchmark( target=target, field_mapping=field_mapping, prompt_template=prompt_template, - metric_bundle_packager=CloudpickleMetricBundlePackager(), + metric_bundle_packager=resolve_default_metric_bundle_packager( + metrics, None, allow_cloudpickle_fallback=True, action="Running" + ), ) payload = self.run_local( spec=spec, @@ -538,7 +543,9 @@ async def evaluate( target=target, field_mapping=field_mapping, prompt_template=prompt_template, - metric_bundle_packager=CloudpickleMetricBundlePackager(), + metric_bundle_packager=resolve_default_metric_bundle_packager( + metric, None, allow_cloudpickle_fallback=True, action="Running" + ), ) payload = await self.run_local( spec=spec, @@ -569,7 +576,9 @@ async def evaluate_benchmark( target=target, field_mapping=field_mapping, prompt_template=prompt_template, - metric_bundle_packager=CloudpickleMetricBundlePackager(), + metric_bundle_packager=resolve_default_metric_bundle_packager( + metrics, None, allow_cloudpickle_fallback=True, action="Running" + ), ) payload = await self.run_local( spec=spec, diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/sdk/metric_resources.py b/plugins/nemo-evaluator/src/nemo_evaluator/sdk/metric_resources.py index 4def0cf79e..7854e468c4 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/sdk/metric_resources.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/sdk/metric_resources.py @@ -20,6 +20,7 @@ MetricBundlePackager, bundle_metric, ) +from nemo_evaluator.shared.metric_bundles.defaults import resolve_default_metric_bundle_packager from nemo_evaluator_sdk.metrics.protocol import Metric as RuntimeMetric from nemo_platform import AsyncNeMoPlatform, NeMoPlatform from 
nemo_platform_plugin.schema import Page @@ -32,13 +33,11 @@ def _metric_inline( """Package a runtime metric (or accept a pre-built bundle) as the wire DTO.""" if isinstance(metric, MetricBundle): bundle = metric - elif metric_bundle_packager is None: - raise ValueError( - "metric_bundle_packager is required when storing a runtime metric; " - "pass CloudpickleMetricBundlePackager(), or pass a pre-built MetricBundle." - ) else: - bundle = bundle_metric(metric, metric_bundle_packager) + packager = resolve_default_metric_bundle_packager( + metric, metric_bundle_packager, allow_cloudpickle_fallback=False, action="Storing" + ) + bundle = bundle_metric(metric, packager) # JSON round-trip keeps the base64 payload encoding consistent with the runtime model. return MetricInline.model_validate_json(bundle.model_dump_json()) diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/sdk/resources.py b/plugins/nemo-evaluator/src/nemo_evaluator/sdk/resources.py index 2f210c4b90..fd4528071d 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/sdk/resources.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/sdk/resources.py @@ -30,6 +30,7 @@ RunConfigOnlineModel, ) from nemo_evaluator.shared.metric_bundles.bundles import MetricBundlePackager +from nemo_evaluator.shared.metric_bundles.defaults import resolve_default_metric_bundle_packager from nemo_evaluator_sdk.metrics.protocol import Metric from nemo_evaluator_sdk.values import ( Agent, @@ -135,11 +136,6 @@ def submit( metric_bundle_packager: MetricBundlePackager | None = None, ) -> EvaluatorJobResource: """Submit a metric job through the evaluator plugin executor.""" - if metric_bundle_packager is None: - raise ValueError( - "metric_bundle_packager is required for submit(); " - "pass CloudpickleMetricBundlePackager() to enable metric bundling." 
- ) return self._executor.submit( metric=metric, dataset=dataset, @@ -147,7 +143,9 @@ def submit( target=target, field_mapping=field_mapping, prompt_template=prompt_template, - metric_bundle_packager=metric_bundle_packager, + metric_bundle_packager=resolve_default_metric_bundle_packager( + metric, metric_bundle_packager, allow_cloudpickle_fallback=False, action="Submitting" + ), ) @overload @@ -365,11 +363,6 @@ async def submit( metric_bundle_packager: MetricBundlePackager | None = None, ) -> AsyncEvaluatorJobResource: """Submit a metric job through the evaluator plugin executor.""" - if metric_bundle_packager is None: - raise ValueError( - "metric_bundle_packager is required for submit(); " - "pass CloudpickleMetricBundlePackager() to enable metric bundling." - ) return await self._executor.submit( metric=metric, dataset=dataset, @@ -377,7 +370,9 @@ async def submit( target=target, field_mapping=field_mapping, prompt_template=prompt_template, - metric_bundle_packager=metric_bundle_packager, + metric_bundle_packager=resolve_default_metric_bundle_packager( + metric, metric_bundle_packager, allow_cloudpickle_fallback=False, action="Submitting" + ), ) diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/bundles.py b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/bundles.py index bef57ebe50..6d99dd6d73 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/bundles.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/bundles.py @@ -34,6 +34,10 @@ class MetricBundlingError(ValueError): """Raised when a metric cannot be bundled or hydrated.""" +class MetricBundlePackagerPolicyError(RuntimeError): + """Raised when metric packaging is not configured for an operation.""" + + class MetricMetadata(BaseModel): """User-facing metadata captured with a bundled metric.""" diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/defaults.py 
b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/defaults.py new file mode 100644 index 0000000000..49f7eb2a02 --- /dev/null +++ b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/defaults.py @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Default metric bundle packager selection. + +Encapsulates the policy for choosing a packager when the caller does not provide +one explicitly. Built-in metric types use the inline packager (config-serialized, +no code execution). Custom metrics fall back to cloudpickle for local execution, +or require an explicit cloudpickle opt-in for operations that ship the metric to +the service. +""" + +from __future__ import annotations + +from collections.abc import Sequence + +from nemo_evaluator.shared.metric_bundles.bundles import ( + MetricBundlePackager, + MetricBundlePackagerPolicyError, +) +from nemo_evaluator.shared.metric_bundles.hybrid import HybridMetricBundlePackager +from nemo_evaluator.shared.metric_bundles.inline import InlineMetricBundlePackager, inline_bundle_supported +from nemo_evaluator_sdk.metrics.protocol import Metric + + +def resolve_default_metric_bundle_packager( + metric: Metric | Sequence[Metric], + explicit: MetricBundlePackager | None, + *, + allow_cloudpickle_fallback: bool, + action: str, +) -> MetricBundlePackager: + """Resolve the packager to use for one or more metrics. + + An explicit packager is always honored. Otherwise the inline packager is used + when every metric is a built-in type. When a custom metric is present, local + execution (``allow_cloudpickle_fallback=True``) uses the hybrid packager so + built-in metrics still bundle inline and only the custom metric is + cloudpickled; operations that ship the metric to the service require an + explicit opt-in instead. 
+ + Args: + metric: A runtime metric, or a sequence of them (one packager applies to all). + explicit: A caller-provided packager, if any. + allow_cloudpickle_fallback: Whether custom metrics may default to cloudpickle. + action: Verb describing the operation, used in the error message. + + Returns: + The packager to bundle the metric(s) with. + + Raises: + MetricBundlePackagerPolicyError: When a custom metric needs an explicit + packager and no fallback is allowed. + """ + if explicit is not None: + return explicit + metrics = metric if isinstance(metric, Sequence) and not isinstance(metric, (str, bytes)) else [metric] + if all(inline_bundle_supported(item) for item in metrics): + return InlineMetricBundlePackager() + if allow_cloudpickle_fallback: + return HybridMetricBundlePackager() + raise MetricBundlePackagerPolicyError( + f"{action} a custom metric requires an explicit metric_bundle_packager; " + "pass HybridMetricBundlePackager() (recommended — built-in metrics stay inline) " + "or CloudpickleMetricBundlePackager() to bundle the metric code." + ) diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/hybrid.py b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/hybrid.py new file mode 100644 index 0000000000..0276c624c3 --- /dev/null +++ b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/hybrid.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Hybrid metric bundle packager. + +Packages each metric with the lightest representation that supports it: built-in +metric types are bundled inline (config-serialized, reconstructed from the metric +type union), and only metrics that cannot be reconstructed from configuration are +cloudpickled. 
This minimizes cloudpickle usage so built-in metrics avoid the +Python-version coupling of cloudpickle payloads — relevant when a remote service +runs a different interpreter than the submitter. +""" + +from __future__ import annotations + +from nemo_evaluator.shared.metric_bundles.bundles import ( + MetricBundlePackager, + MetricBundlePayload, + metric_bundle_packager_for_payload, +) +from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager +from nemo_evaluator.shared.metric_bundles.inline import InlineMetricBundlePackager, inline_bundle_supported +from nemo_evaluator_sdk.metrics.protocol import Metric + + +class HybridMetricBundlePackager(MetricBundlePackager): + """Inline built-in metrics; cloudpickle only metrics that require it. + + Applied per metric, so a mixed set bundles each metric independently: inline + where the metric type is reconstructable, cloudpickle otherwise. Loading is + dispatched by the stored payload kind, so hydration works regardless of which + representation each metric used. 
+ """ + + def __init__(self) -> None: + """Build the delegate inline and cloudpickle packagers.""" + self._inline = InlineMetricBundlePackager() + self._cloudpickle = CloudpickleMetricBundlePackager() + + def package(self, metric: Metric) -> MetricBundlePayload: + """Inline the metric when its type is reconstructable; cloudpickle otherwise.""" + if inline_bundle_supported(metric): + return self._inline.package(metric) + return self._cloudpickle.package(metric) + + def load(self, payload: MetricBundlePayload) -> Metric: + """Hydrate by dispatching to the packager registered for the payload kind.""" + return metric_bundle_packager_for_payload(payload).load(payload) diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/inline.py b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/inline.py new file mode 100644 index 0000000000..af395321c6 --- /dev/null +++ b/plugins/nemo-evaluator/src/nemo_evaluator/shared/metric_bundles/inline.py @@ -0,0 +1,107 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Inline metric bundle implementation. + +The inline packager stores a built-in metric as its own JSON configuration +instead of a serialized code blob. The runtime reconstructs the metric from the +``MetricsUnion`` discriminated union (keyed on the metric ``type``), so no +arbitrary code is executed on load. This is the default bundler for metric types +the platform already recognizes; custom metric classes that are not part of +``MetricsUnion`` cannot be reconstructed from config and require the +``CloudpickleMetricBundlePackager`` instead. 
+""" + +from __future__ import annotations + +import hashlib +import json +from typing import Any, Literal, get_args + +from nemo_evaluator.shared.metric_bundles.bundles import ( + MetricBundlePackager, + MetricBundlePayload, + MetricBundlingError, + register_metric_bundle_kind, +) +from nemo_evaluator_sdk.metrics.protocol import Metric +from nemo_evaluator_sdk.metrics.types import MetricsUnion, MetricVariants +from pydantic import ConfigDict, TypeAdapter, computed_field, field_validator + +# Discriminated union (keyed on ``type``) used to serialize and reconstruct the +# concrete built-in metric. Reconstruction is pure data validation — no code is +# executed — so inline bundles are safe to hydrate. +_METRIC_ADAPTER: TypeAdapter[Any] = TypeAdapter(MetricsUnion) + +# Concrete metric classes that participate in ``MetricsUnion``. A metric must be +# an instance of one of these to be inline-bundleable. +_INLINE_SUPPORTED_TYPES: tuple[type, ...] = tuple(get_args(MetricVariants)) + + +def inline_bundle_supported(metric: object) -> bool: + """Return whether a metric can be bundled inline (reconstructed from config).""" + return isinstance(metric, _INLINE_SUPPORTED_TYPES) + + +class InlineMetricPayload(MetricBundlePayload): + """Payload storing a built-in metric as its JSON configuration.""" + + model_config = ConfigDict(extra="ignore") + + metric: dict[str, Any] + + @field_validator("metric") + @classmethod + def _metric_must_declare_type(cls, value: dict[str, Any]) -> dict[str, Any]: + metric_type = value.get("type") + if not isinstance(metric_type, str) or not metric_type: + raise MetricBundlingError("inline metric payload must include a non-empty 'type'") + return value + + @property + def kind(self) -> Literal["inline"]: + """Payload discriminator used by the metric bundle registry.""" + return "inline" + + @computed_field + @property + def digest(self) -> str: + """Digest of the canonical serialized metric configuration.""" + canonical = json.dumps(self.metric, 
sort_keys=True, separators=(",", ":")) + return hashlib.sha256(canonical.encode("utf-8")).hexdigest() + + +class InlineMetricBundlePackager(MetricBundlePackager): + """Inline metric bundle packager. + + Serializes a built-in metric as JSON and reconstructs it from the metric + type union on load. No arbitrary code is executed when hydrating, so this is + the preferred default for platform-recognized metric types. + """ + + def package(self, metric: Metric) -> MetricBundlePayload: + """Package a built-in metric object as its JSON configuration.""" + if not isinstance(metric, Metric): + raise MetricBundlingError("object does not satisfy the Metric protocol") + if not inline_bundle_supported(metric): + raise MetricBundlingError( + "inline metric bundling supports only built-in metric types; " + "pass CloudpickleMetricBundlePackager() to bundle a custom metric." + ) + data: dict[str, Any] = _METRIC_ADAPTER.dump_python(metric, mode="json") + return InlineMetricPayload(metric=data) + + def load(self, payload: MetricBundlePayload) -> Metric: + """Hydrate a metric from an inline payload by validating its configuration.""" + inline_payload = InlineMetricPayload.model_validate(payload.model_dump(mode="python")) + hydrated_metric = _METRIC_ADAPTER.validate_python(inline_payload.metric) + if not isinstance(hydrated_metric, Metric): + raise MetricBundlingError("unbundled object does not satisfy the Metric protocol") + return hydrated_metric + + +register_metric_bundle_kind( + "inline", + payload_type=InlineMetricPayload, + packager_factory=InlineMetricBundlePackager, +) diff --git a/plugins/nemo-evaluator/tests/sdk/test_metric_sdk_resources.py b/plugins/nemo-evaluator/tests/sdk/test_metric_sdk_resources.py index 9e3c764d55..75115e0934 100644 --- a/plugins/nemo-evaluator/tests/sdk/test_metric_sdk_resources.py +++ b/plugins/nemo-evaluator/tests/sdk/test_metric_sdk_resources.py @@ -6,7 +6,7 @@ from __future__ import annotations from datetime import datetime, timezone -from typing 
import Any +from typing import Any, cast from unittest.mock import AsyncMock, MagicMock import pytest @@ -15,9 +15,30 @@ AsyncEvaluatorMetricsResource, EvaluatorMetricsResource, ) -from nemo_evaluator.shared.metric_bundles.bundles import MetricBundle, bundle_metric +from nemo_evaluator.shared.metric_bundles.bundles import ( + MetricBundle, + MetricBundlePackagerPolicyError, + bundle_metric, +) from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager from nemo_evaluator_sdk.metrics.exact_match import ExactMatchMetric +from nemo_evaluator_sdk.metrics.protocol import Metric as RuntimeMetric +from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult + + +class _CustomRuntimeMetric: + """A protocol-satisfying metric that is not inline-bundleable.""" + + type = "custom-score" + description = "custom metric" + labels: dict[str, str] = {} + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("score")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + del input + return MetricResult(outputs=[MetricOutput(name="score", value=1.0)]) def _bundle() -> MetricBundle: @@ -81,12 +102,24 @@ def test_sync_create_posts_bundle_and_returns_metric() -> None: assert body["payload"]["kind"] == "cloudpickle" -def test_sync_create_requires_packager_for_runtime_metric() -> None: - resource = EvaluatorMetricsResource(_platform(MagicMock())) +def test_sync_create_defaults_to_inline_for_builtin_metric() -> None: + bundle = _bundle() + http_client = MagicMock() + http_client.post.return_value = _response(_metric_response("exact", bundle)) + resource = EvaluatorMetricsResource(_platform(http_client)) metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") - with pytest.raises(ValueError, match="metric_bundle_packager is required"): - resource.create("exact", metric=metric) + resource.create("exact", metric=metric) + + 
body = http_client.post.call_args.kwargs["json"] + assert body["payload"]["kind"] == "inline" + + +def test_sync_create_requires_explicit_packager_for_custom_metric() -> None: + resource = EvaluatorMetricsResource(_platform(MagicMock())) + + with pytest.raises(MetricBundlePackagerPolicyError, match="CloudpickleMetricBundlePackager"): + resource.create("custom", metric=cast(RuntimeMetric, _CustomRuntimeMetric())) def test_sync_retrieve_targets_item_url() -> None: diff --git a/plugins/nemo-evaluator/tests/shared/metric_bundles/test_defaults.py b/plugins/nemo-evaluator/tests/shared/metric_bundles/test_defaults.py new file mode 100644 index 0000000000..378935aba7 --- /dev/null +++ b/plugins/nemo-evaluator/tests/shared/metric_bundles/test_defaults.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import cast + +import pytest +from nemo_evaluator.shared.metric_bundles.bundles import MetricBundlePackagerPolicyError +from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager +from nemo_evaluator.shared.metric_bundles.defaults import resolve_default_metric_bundle_packager +from nemo_evaluator.shared.metric_bundles.hybrid import HybridMetricBundlePackager +from nemo_evaluator.shared.metric_bundles.inline import InlineMetricBundlePackager +from nemo_evaluator_sdk.metrics.exact_match import ExactMatchMetric +from nemo_evaluator_sdk.metrics.protocol import Metric, MetricInput, MetricOutput, MetricOutputSpec, MetricResult + + +class _CustomMetric: + """A protocol-satisfying metric that is not part of MetricsUnion (not inline-bundleable).""" + + type = "custom-score" + description = "custom metric" + labels: dict[str, str] = {} + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("score")] + + async def compute_scores(self, input: 
MetricInput) -> MetricResult: + del input + return MetricResult(outputs=[MetricOutput(name="score", value=1.0)]) + + +def _builtin() -> Metric: + return ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") + + +def test_defaults_inline_for_builtin_and_raises_for_custom_submit() -> None: + assert isinstance( + resolve_default_metric_bundle_packager(_builtin(), None, allow_cloudpickle_fallback=False, action="Submitting"), + InlineMetricBundlePackager, + ) + with pytest.raises(MetricBundlePackagerPolicyError, match="CloudpickleMetricBundlePackager"): + resolve_default_metric_bundle_packager( + cast(Metric, _CustomMetric()), None, allow_cloudpickle_fallback=False, action="Submitting" + ) + + +def test_uses_hybrid_for_custom_local_run() -> None: + assert isinstance( + resolve_default_metric_bundle_packager( + cast(Metric, _CustomMetric()), None, allow_cloudpickle_fallback=True, action="Running" + ), + HybridMetricBundlePackager, + ) + + +def test_uses_hybrid_for_mixed_local_run_but_inline_when_all_builtin() -> None: + mixed = [_builtin(), cast(Metric, _CustomMetric())] + assert isinstance( + resolve_default_metric_bundle_packager(mixed, None, allow_cloudpickle_fallback=True, action="Running"), + HybridMetricBundlePackager, + ) + assert isinstance( + resolve_default_metric_bundle_packager( + [_builtin()], None, allow_cloudpickle_fallback=False, action="Submitting" + ), + InlineMetricBundlePackager, + ) + + +def test_honors_explicit_packager() -> None: + explicit = CloudpickleMetricBundlePackager() + assert ( + resolve_default_metric_bundle_packager( + _builtin(), explicit, allow_cloudpickle_fallback=False, action="Submitting" + ) + is explicit + ) diff --git a/plugins/nemo-evaluator/tests/shared/metric_bundles/test_hybrid.py b/plugins/nemo-evaluator/tests/shared/metric_bundles/test_hybrid.py new file mode 100644 index 0000000000..f483e99e54 --- /dev/null +++ b/plugins/nemo-evaluator/tests/shared/metric_bundles/test_hybrid.py @@ -0,0 +1,70 @@ +# 
SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import cast + +from nemo_evaluator.shared.metric_bundles.bundles import MetricBundle, bundle_metric, unbundle_metric +from nemo_evaluator.shared.metric_bundles.hybrid import HybridMetricBundlePackager +from nemo_evaluator_sdk.metrics.exact_match import ExactMatchMetric +from nemo_evaluator_sdk.metrics.protocol import Metric, MetricInput, MetricOutput, MetricOutputSpec, MetricResult + + +class _CustomMetric: + """A protocol-satisfying metric that is not inline-bundleable (module-level so it cloudpickles).""" + + type = "custom-score" + description = "custom metric" + labels: dict[str, str] = {} + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("score")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + del input + return MetricResult(outputs=[MetricOutput(name="score", value=1.0)]) + + +def _roundtrip(bundle: MetricBundle) -> Metric: + return unbundle_metric(MetricBundle.model_validate_json(bundle.model_dump_json())) + + +def test_hybrid_inlines_builtin_metric() -> None: + metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") + + bundle = bundle_metric(metric, HybridMetricBundlePackager()) + + assert bundle.payload.kind == "inline" + assert type(_roundtrip(bundle)) is ExactMatchMetric + + +def test_hybrid_cloudpickles_custom_metric() -> None: + bundle = bundle_metric(cast(Metric, _CustomMetric()), HybridMetricBundlePackager()) + + assert bundle.payload.kind == "cloudpickle" + assert isinstance(_roundtrip(bundle), _CustomMetric) + + +def test_hybrid_routes_each_metric_independently() -> None: + """A mixed set bundles each metric with the lightest representation that supports it.""" + packager = HybridMetricBundlePackager() + metrics: list[Metric] = [ + 
ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}"), + cast(Metric, _CustomMetric()), + ] + + kinds = [bundle_metric(metric, packager).payload.kind for metric in metrics] + + assert kinds == ["inline", "cloudpickle"] + + +def test_hybrid_load_dispatches_by_payload_kind() -> None: + packager = HybridMetricBundlePackager() + inline_bundle = bundle_metric( + ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}"), packager + ) + cloudpickle_bundle = bundle_metric(cast(Metric, _CustomMetric()), packager) + + assert type(packager.load(inline_bundle.payload)) is ExactMatchMetric + assert isinstance(packager.load(cloudpickle_bundle.payload), _CustomMetric) diff --git a/plugins/nemo-evaluator/tests/shared/metric_bundles/test_inline.py b/plugins/nemo-evaluator/tests/shared/metric_bundles/test_inline.py new file mode 100644 index 0000000000..c31043ef58 --- /dev/null +++ b/plugins/nemo-evaluator/tests/shared/metric_bundles/test_inline.py @@ -0,0 +1,208 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import hashlib +import json +from collections.abc import Sequence +from typing import cast + +import pytest +from nemo_evaluator.api.schemas import MetricInline +from nemo_evaluator.shared.metric_bundles.bundles import ( + MetricBundle, + MetricBundlingError, + bundle_metric, + unbundle_metric, +) +from nemo_evaluator.shared.metric_bundles.inline import ( + InlineMetricBundlePackager, + InlineMetricPayload, + inline_bundle_supported, +) +from nemo_evaluator_sdk.enums import ModelFormat +from nemo_evaluator_sdk.metrics.bleu import BLEUMetric +from nemo_evaluator_sdk.metrics.exact_match import ExactMatchMetric +from nemo_evaluator_sdk.metrics.f1 import F1Metric +from nemo_evaluator_sdk.metrics.llm_judge import LLMJudgeMetric +from nemo_evaluator_sdk.metrics.number_check import NumberCheckMetric +from nemo_evaluator_sdk.metrics.protocol import Metric, MetricInput, MetricOutput, MetricOutputSpec, MetricResult +from nemo_evaluator_sdk.metrics.ragas import ( + AgentGoalAccuracyMetric, + AnswerAccuracyMetric, + ContextEntityRecallMetric, + ContextPrecisionMetric, + ContextRecallMetric, + ContextRelevanceMetric, + FaithfulnessMetric, + NoiseSensitivityMetric, + ResponseGroundednessMetric, + ResponseRelevancyMetric, + ToolCallAccuracyMetric, + TopicAdherenceMetric, +) +from nemo_evaluator_sdk.metrics.remote import NemoAgentToolkitRemoteMetric, RemoteMetric +from nemo_evaluator_sdk.metrics.rouge import ROUGEMetric +from nemo_evaluator_sdk.metrics.string_check import StringCheckMetric +from nemo_evaluator_sdk.metrics.tool_calling import ToolCallingMetric +from nemo_evaluator_sdk.values import Model, SecretRef +from nemo_evaluator_sdk.values.scores import JSONScoreParser, RangeScore, RemoteScore +from pydantic import BaseModel + + +class _CustomMetric: + """A protocol-satisfying metric that is not part of MetricsUnion.""" + + type = "custom-score" + description = "custom metric" + labels = 
{"source": "test"} + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("score")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + del input + return MetricResult(outputs=[MetricOutput(name="score", value=1.0)]) + + +def _judge_model() -> Model: + return Model(url="https://judge.example.test/v1/chat/completions", name="judge-model", format=ModelFormat.OPEN_AI) + + +def _embeddings_model() -> Model: + return Model(url="https://judge.example.test/v1/embeddings", name="embedding-model", format=ModelFormat.OPEN_AI) + + +def _builtin_metric_cases() -> Sequence[tuple[str, Metric]]: + judge_model = _judge_model() + return [ + ("exact_match", ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}")), + ("f1", F1Metric(reference="{{item.expected}}", candidate="{{item.output}}")), + ("bleu", BLEUMetric(references=["{{item.expected}}"], candidate="{{item.output}}")), + ("rouge", ROUGEMetric(reference="{{item.expected}}", candidate="{{item.output}}")), + ( + "string_check", + StringCheckMetric( + operation="contains", left_template="{{item.output}}", right_template="{{item.expected}}" + ), + ), + ( + "number_check", + NumberCheckMetric(operation="equals", left_template="{{item.left}}", right_template="{{item.right}}"), + ), + ("tool_calling", ToolCallingMetric(reference="{{item.expected_tool_calls}}")), + ( + "llm_judge", + LLMJudgeMetric( + model=judge_model, + scores=[ + RangeScore( + name="helpfulness", minimum=1, maximum=5, parser=JSONScoreParser(json_path="helpfulness") + ) + ], + prompt_template="Judge: {{item.expected}} -> {{item.output}}", + ), + ), + ( + "remote", + RemoteMetric( + url="https://remote.example.test", + body={"prompt": "{{item.prompt}}"}, + scores=[RemoteScore(name="quality", parser=JSONScoreParser(json_path="$.result.quality"))], + ), + ), + ( + "nemo_agent_toolkit_remote", + NemoAgentToolkitRemoteMetric(url="https://remote.example.test", 
evaluator_name="nat-quality"), + ), + ("topic_adherence", TopicAdherenceMetric(metric_mode="f1", judge_model=judge_model)), + ("tool_call_accuracy", ToolCallAccuracyMetric()), + ("agent_goal_accuracy", AgentGoalAccuracyMetric(judge_model=judge_model)), + ("answer_accuracy", AnswerAccuracyMetric(judge_model=judge_model)), + ("context_relevance", ContextRelevanceMetric(judge_model=judge_model)), + ("response_groundedness", ResponseGroundednessMetric(judge_model=judge_model)), + ("context_recall", ContextRecallMetric(judge_model=judge_model)), + ("context_precision", ContextPrecisionMetric(judge_model=judge_model)), + ("context_entity_recall", ContextEntityRecallMetric(judge_model=judge_model)), + ("response_relevancy", ResponseRelevancyMetric(judge_model=judge_model, embeddings_model=_embeddings_model())), + ("faithfulness", FaithfulnessMetric(judge_model=judge_model)), + ("noise_sensitivity", NoiseSensitivityMetric(judge_model=judge_model)), + ] + + +_CASES = _builtin_metric_cases() +_CASE_IDS = [name for name, _ in _CASES] + + +@pytest.mark.parametrize(("case_name", "metric"), _CASES, ids=_CASE_IDS) +def test_inline_packager_round_trips_every_builtin_metric(case_name: str, metric: Metric) -> None: + """Every built-in metric serializes inline and reconstructs to an identical object.""" + bundle = bundle_metric(metric, InlineMetricBundlePackager()) + + # Full wire round-trip: runtime bundle -> JSON -> runtime bundle. + restored = MetricBundle.model_validate_json(bundle.model_dump_json()) + hydrated = unbundle_metric(restored) + + assert restored.payload.kind == "inline", case_name + assert restored.metric_type == metric.type, case_name + assert type(hydrated) is type(metric), case_name + # Inline reconstruction must be lossless at the config level (not just the type). 
+ assert cast(BaseModel, hydrated).model_dump() == cast(BaseModel, metric).model_dump(), case_name + assert [o.name for o in hydrated.output_spec()] == [o.name for o in metric.output_spec()], case_name + + +@pytest.mark.parametrize(("case_name", "metric"), _CASES, ids=_CASE_IDS) +def test_inline_payload_passes_through_wire_dto(case_name: str, metric: Metric) -> None: + """The inline payload survives the MetricInline wire DTO (OpenAPI contract).""" + bundle = bundle_metric(metric, InlineMetricBundlePackager()) + + wire = MetricInline.model_validate_json(bundle.model_dump_json()) + + assert wire.payload.kind == "inline", case_name + assert wire.metric_type + assert wire.outputs + # Re-validating the wire DTO JSON back into a runtime bundle must still hydrate. + runtime_again = MetricBundle.model_validate_json(wire.model_dump_json()) + assert type(unbundle_metric(runtime_again)) is type(metric), case_name + + +def test_inline_payload_digest_is_canonical_sha256() -> None: + metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") + bundle = bundle_metric(metric, InlineMetricBundlePackager()) + payload = InlineMetricPayload.model_validate(bundle.payload.model_dump(mode="python")) + + expected = hashlib.sha256( + json.dumps(payload.metric, sort_keys=True, separators=(",", ":")).encode("utf-8") + ).hexdigest() + assert payload.digest == expected + serialized = cast(dict[str, object], bundle.model_dump(mode="json")["payload"]) + assert serialized["kind"] == "inline" + assert serialized["digest"] == expected + + +def test_inline_packager_rejects_custom_metric() -> None: + with pytest.raises(MetricBundlingError, match="CloudpickleMetricBundlePackager"): + bundle_metric(cast(Metric, _CustomMetric()), InlineMetricBundlePackager()) + + +def test_inline_bundle_supported_classifies_metrics() -> None: + assert inline_bundle_supported(ExactMatchMetric(reference="{{item.expected}}")) + assert not inline_bundle_supported(cast(Metric, _CustomMetric())) + + 
+def test_inline_captures_metric_secrets() -> None: + metric = LLMJudgeMetric( + model=Model( + url="https://judge.example.test/v1/chat/completions", + name="judge-model", + api_key_secret=SecretRef(root="judge-secret"), + format=ModelFormat.OPEN_AI, + ), + scores=[RangeScore(name="helpfulness", minimum=1, maximum=5, parser=JSONScoreParser(json_path="helpfulness"))], + ) + + bundle = bundle_metric(metric, InlineMetricBundlePackager()) + restored = MetricBundle.model_validate_json(bundle.model_dump_json()) + + assert restored.secrets == {"judge_secret": SecretRef(root="judge-secret")} diff --git a/plugins/nemo-evaluator/tests/test_inline_bundle_execution.py b/plugins/nemo-evaluator/tests/test_inline_bundle_execution.py new file mode 100644 index 0000000000..2ac18a4c27 --- /dev/null +++ b/plugins/nemo-evaluator/tests/test_inline_bundle_execution.py @@ -0,0 +1,163 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""End-to-end execution tests for inline-bundled metrics. + +These tests run real metric scoring — no mocks. The end-to-end cases drive the +full evaluator job (`EvaluateJob`) through the local scheduler, exercising the +complete inline path: bundle -> MetricInline wire DTO -> job spec -> unbundle +(reconstruct from config) -> execute -> aggregate scores. The reconstruction +tests round-trip each metric through the bundle and then actually invoke the +hydrated metric's `compute_scores`. 
+""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, cast + +import pytest +from nemo_evaluator.jobs.evaluate import EvaluateJob +from nemo_evaluator.shared.metric_bundles.bundles import bundle_metric, unbundle_metric +from nemo_evaluator.shared.metric_bundles.hybrid import HybridMetricBundlePackager +from nemo_evaluator.shared.metric_bundles.inline import InlineMetricBundlePackager +from nemo_evaluator_sdk.execution.samples import build_metric_input +from nemo_evaluator_sdk.metrics.exact_match import ExactMatchMetric +from nemo_evaluator_sdk.metrics.f1 import F1Metric +from nemo_evaluator_sdk.metrics.number_check import NumberCheckMetric +from nemo_evaluator_sdk.metrics.protocol import Metric, MetricInput, MetricOutput, MetricOutputSpec, MetricResult +from nemo_evaluator_sdk.metrics.string_check import StringCheckMetric +from nemo_platform_plugin.scheduler import NemoJobScheduler + + +class _CustomConstantMetric: + """Module-level custom metric (cloudpicklable) that always scores 1.0.""" + + type = "custom-constant" + description = "custom constant metric" + labels: dict[str, str] = {} + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("constant")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + del input + return MetricResult(outputs=[MetricOutput(name="constant", value=1.0)]) + + +def _inline_payload(metric: Metric) -> dict[str, Any]: + """Bundle a metric inline and project it to the job-spec wire shape.""" + return bundle_metric(metric, InlineMetricBundlePackager()).model_dump(mode="json") + + +def _load_artifact_payload(run_result: dict[str, Any]) -> dict[str, Any]: + artifact_path = Path(run_result["artifact"]["artifact_url"].removeprefix("file://")) + return cast(dict[str, Any], json.loads(artifact_path.read_text(encoding="utf-8"))) + + +def _aggregate_scores(run_result: dict[str, Any]) -> list[dict[str, Any]]: + return 
cast(list[dict[str, Any]], _load_artifact_payload(run_result)["aggregate_scores"]["scores"]) + + +def test_evaluate_job_runs_inline_bundled_exact_match_metric() -> None: + """Full job run with an inline-bundled metric produces real aggregate scores.""" + spec = { + "metrics": [ + _inline_payload(ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.model_output}}")) + ], + "dataset": [ + {"expected": "blue", "model_output": "Blue"}, # normalizes equal -> 1.0 + {"expected": "Jupiter", "model_output": "Saturn"}, # -> 0.0 + ], + "params": {"parallelism": 2}, + } + + result = NemoJobScheduler().run_local(EvaluateJob, spec) + + scores = _aggregate_scores(result) + assert scores[0]["name"] == "exact-match.exact-match" + assert scores[0]["mean"] == 0.5 + + +def test_evaluate_job_runs_multiple_inline_metrics() -> None: + """Multiple inline-bundled metrics in one job each execute and aggregate.""" + spec = { + "metrics": [ + _inline_payload(ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.model_output}}")), + _inline_payload( + StringCheckMetric( + operation="contains", + left_template="{{item.model_output}}", + right_template="{{item.expected}}", + ) + ), + ], + "dataset": [ + {"expected": "Paris", "model_output": "Paris"}, + {"expected": "Paris", "model_output": "London"}, + ], + "params": {"parallelism": 2}, + } + + result = NemoJobScheduler().run_local(EvaluateJob, spec) + + by_name = {score["name"]: score for score in _aggregate_scores(result)} + assert by_name["exact-match.exact-match"]["mean"] == 0.5 + assert by_name["string-check.string-check"]["mean"] == 0.5 + + +def test_evaluate_job_runs_hybrid_bundled_mixed_metrics() -> None: + """Hybrid bundling: built-in goes inline, custom is cloudpickled, and both execute in one job.""" + packager = HybridMetricBundlePackager() + builtin_payload = bundle_metric( + ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.model_output}}"), packager + ).model_dump(mode="json") + 
custom_payload = bundle_metric(cast(Metric, _CustomConstantMetric()), packager).model_dump(mode="json") + + # The built-in stays inline (no Python-version coupling); only the custom metric is cloudpickled. + assert builtin_payload["payload"]["kind"] == "inline" + assert custom_payload["payload"]["kind"] == "cloudpickle" + + spec = { + "metrics": [builtin_payload, custom_payload], + "dataset": [ + {"expected": "blue", "model_output": "Blue"}, # exact-match -> 1.0 + {"expected": "Jupiter", "model_output": "Saturn"}, # exact-match -> 0.0 + ], + "params": {"parallelism": 2}, + } + + result = NemoJobScheduler().run_local(EvaluateJob, spec) + + by_name = {score["name"]: score for score in _aggregate_scores(result)} + assert by_name["exact-match.exact-match"]["mean"] == 0.5 + assert by_name["custom-constant.constant"]["mean"] == 1.0 + + +@pytest.mark.asyncio +async def test_round_tripped_deterministic_metrics_execute_identically() -> None: + """After an inline bundle round-trip, hydrated metrics score identically to the originals.""" + item = {"expected": "the answer is 42", "left": "42", "right": "42.0"} + sample = {"output_text": "The answer is 42!"} + + metrics: list[Metric] = [ + ExactMatchMetric(reference="{{item.expected}}", candidate="{{sample.output_text}}"), + F1Metric(reference="{{item.expected}}", candidate="{{sample.output_text}}"), + NumberCheckMetric(operation="equals", left_template="{{item.left}}", right_template="{{item.right}}"), + StringCheckMetric( + operation="contains", left_template="{{sample.output_text}}", right_template="{{item.expected}}" + ), + ] + + for metric in metrics: + hydrated = unbundle_metric(bundle_metric(metric, InlineMetricBundlePackager())) + + original_result = await metric.compute_scores(build_metric_input(item, sample)) + hydrated_result = await hydrated.compute_scores(build_metric_input(item, sample)) + + original_values = [(o.name, o.value) for o in original_result.outputs] + hydrated_values = [(o.name, o.value) for o in 
hydrated_result.outputs] + assert hydrated_values == original_values, type(metric).__name__ + # Sanity: the deterministic scorers actually produced a score. + assert hydrated_result.outputs, type(metric).__name__ diff --git a/plugins/nemo-evaluator/tests/test_sdk.py b/plugins/nemo-evaluator/tests/test_sdk.py index b83a42d3d1..6b3f90bd3b 100644 --- a/plugins/nemo-evaluator/tests/test_sdk.py +++ b/plugins/nemo-evaluator/tests/test_sdk.py @@ -31,8 +31,9 @@ bundle_metric, ) from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager +from nemo_evaluator.shared.metric_bundles.inline import InlineMetricBundlePackager from nemo_evaluator_sdk.metrics.exact_match import ExactMatchMetric -from nemo_evaluator_sdk.metrics.protocol import Metric +from nemo_evaluator_sdk.metrics.protocol import Metric, MetricInput, MetricOutput, MetricOutputSpec, MetricResult from nemo_evaluator_sdk.values import FieldMapping, Model, ModelRef, RunConfig, RunConfigOnline, RunConfigOnlineModel from nemo_evaluator_sdk.values.results import AggregatedMetricResult, EvaluationResult from nemo_platform import AsyncNeMoPlatform, NeMoPlatform @@ -98,6 +99,21 @@ def load(self, payload: MetricBundlePayload) -> Metric: raise NotImplementedError("test packager only exercises submission-side packaging") +class _CustomRuntimeMetric: + """A protocol-satisfying metric that is not part of MetricsUnion (not inline-bundleable).""" + + type = "custom-score" + description = "custom metric" + labels: dict[str, str] = {} + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("score")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + del input + return MetricResult(outputs=[MetricOutput(name="score", value=1.0)]) + + class _SyncPlatform: def __init__(self) -> None: self.base_url = "http://test:8000" @@ -215,6 +231,29 @@ def test_build_evaluate_spec_requires_metric_bundle_packager() -> None: ) +def 
test_local_run_allows_cloudpickle_fallback_for_custom_metric(mocker: MockerFixture) -> None: + """Local run() executes in the caller's process, so custom metrics fall back to cloudpickle. + + The fallback is enabled only for local execution; remote submit/create still require an + explicit cloudpickle opt-in (covered separately). + """ + import nemo_evaluator.sdk._executor as executor_module + + spy = mocker.spy(executor_module, "resolve_default_metric_bundle_packager") + resource = Evaluator(cast(NeMoPlatform, _SyncPlatform())) + # Short-circuit after packaging so we don't drive the local job runtime. + mocker.patch.object(resource._executor, "run_local", side_effect=RuntimeError("stop after packaging")) + + with pytest.raises(RuntimeError, match="stop after packaging"): + resource.run( + metric=cast(Metric, _CustomRuntimeMetric()), + dataset=[{"expected": "a", "output": "a"}], + ) + + # No MetricBundlePackagerPolicyError: the custom metric was bundled (via cloudpickle) and reached execution. 
+ assert spy.call_args.kwargs["allow_cloudpickle_fallback"] is True + + def test_build_evaluate_spec_includes_target_and_prompt_template() -> None: """Online evaluator specs should preserve model targets and prompt templates.""" model = Model(url="https://model.test/v1", name="model-a") @@ -590,13 +629,27 @@ def test_accepts_model_ref_target(self, mocker: MockerFixture) -> None: metric_bundle_packager=packager, ) - def test_requires_metric_bundle_packager(self) -> None: - """Submit should fail fast before delegating without a remote metric packager.""" + def test_defaults_to_inline_packager_for_builtin_metric(self, mocker: MockerFixture) -> None: + """Submit of a built-in metric without an explicit packager defaults to inline bundling.""" resource = Evaluator(cast(NeMoPlatform, _SyncPlatform())) + expected_job = mocker.Mock(spec=EvaluatorJobResource) + submit = mocker.patch.object(resource._executor, "submit", return_value=expected_job) - with pytest.raises(ValueError, match="metric_bundle_packager is required"): + job = resource.submit( + metric=ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}"), + dataset=[{"expected": "a", "output": "a"}], + ) + + assert job is expected_job + assert isinstance(submit.call_args.kwargs["metric_bundle_packager"], InlineMetricBundlePackager) + + def test_requires_explicit_packager_for_custom_metric(self) -> None: + """Submit of a custom metric requires an explicit cloudpickle opt-in.""" + resource = Evaluator(cast(NeMoPlatform, _SyncPlatform())) + + with pytest.raises(MetricBundlePackagerPolicyError, match="CloudpickleMetricBundlePackager"): resource.submit( - metric=ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}"), + metric=cast(Metric, _CustomRuntimeMetric()), dataset=[{"expected": "a", "output": "a"}], ) @@ -1071,13 +1124,28 @@ async def test_accepts_model_ref_target(self, mocker: MockerFixture) -> None: ) @pytest.mark.asyncio - async def 
test_requires_metric_bundle_packager(self) -> None: - """Submit should fail fast before delegating without a remote metric packager.""" + async def test_defaults_to_inline_packager_for_builtin_metric(self, mocker: MockerFixture) -> None: + """Async submit of a built-in metric defaults to inline bundling.""" + resource = AsyncEvaluator(cast(AsyncNeMoPlatform, _AsyncPlatform())) + expected_job = mocker.Mock(spec=AsyncEvaluatorJobResource) + submit = mocker.patch.object(resource._executor, "submit", new=AsyncMock(return_value=expected_job)) + + job = await resource.submit( + metric=ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}"), + dataset=[{"expected": "a", "output": "a"}], + ) + + assert job is expected_job + assert isinstance(submit.call_args.kwargs["metric_bundle_packager"], InlineMetricBundlePackager) + + @pytest.mark.asyncio + async def test_requires_explicit_packager_for_custom_metric(self) -> None: + """Async submit of a custom metric requires an explicit cloudpickle opt-in.""" resource = AsyncEvaluator(cast(AsyncNeMoPlatform, _AsyncPlatform())) - with pytest.raises(ValueError, match="metric_bundle_packager is required"): + with pytest.raises(MetricBundlePackagerPolicyError, match="CloudpickleMetricBundlePackager"): await resource.submit( - metric=ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}"), + metric=cast(Metric, _CustomRuntimeMetric()), dataset=[{"expected": "a", "output": "a"}], ) From 4c7c59825f6161c44df40a47e546dfbaf6def440 Mon Sep 17 00:00:00 2001 From: Sandy Chapman Date: Wed, 24 Jun 2026 13:44:13 -0300 Subject: [PATCH 2/2] fix(evaluator): address PR #438 review feedback - api/schemas.py: require a non-empty `type` on the inline wire payload so malformed inline metrics are rejected at the API boundary instead of failing at execution (mirrors the runtime InlineMetricPayload validator). The metric body stays an open object; concrete shape is still validated on hydration. 
- docs/evaluator/test_doc_examples.py: stub the executor with a sentinel in the built-in-submit test instead of `except Exception: pass`, so the test proves packaging resolved (no packager required) without swallowing unrelated errors. - docs/evaluator/metrics/llm-as-a-judge.mdx: replace the `{{ end_of_text }}` Fern template token with a literal `<|end_of_text|>` per docs guidelines. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Sandy Chapman --- docs/evaluator/metrics/llm-as-a-judge.mdx | 2 +- docs/evaluator/test_doc_examples.py | 20 +++++++++---------- .../src/nemo_evaluator/api/schemas.py | 16 ++++++++++++++- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/docs/evaluator/metrics/llm-as-a-judge.mdx b/docs/evaluator/metrics/llm-as-a-judge.mdx index 886be51df2..abb357affc 100644 --- a/docs/evaluator/metrics/llm-as-a-judge.mdx +++ b/docs/evaluator/metrics/llm-as-a-judge.mdx @@ -645,7 +645,7 @@ prompt_template = { "temperature": 0.1, # Lower for more consistent scoring "max_tokens": 1024, # Increase if judge needs more space "timeout": 30, # Request timeout in seconds - "stop": ["<{{ end_of_text }}>"], # Stop sequences + "stop": ["<|end_of_text|>"], # Stop sequences } ``` diff --git a/docs/evaluator/test_doc_examples.py b/docs/evaluator/test_doc_examples.py index 59e05fcdd3..6126ee3601 100644 --- a/docs/evaluator/test_doc_examples.py +++ b/docs/evaluator/test_doc_examples.py @@ -105,23 +105,23 @@ def test_packager_param_is_submit_only() -> None: def test_builtin_submit_does_not_require_a_packager() -> None: """Built-in metrics bundle inline, so docs omit the packager on ``submit()``. - Submitting reaches the executor (which then needs a live service); the point - is only that no packager-policy error is raised for a built-in metric. 
+ Packager resolution happens before delegating to the executor, so we stub the + executor with a sentinel: reaching it (rather than raising a packager-policy + error) proves the built-in metric bundled inline with no packager required — + without depending on a live service or swallowing unrelated failures. """ + from unittest.mock import patch + from nemo_evaluator_sdk import ExactMatchMetric evaluator = _evaluator() metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}") dataset = [{"expected": "Paris", "output": "Paris"}] + sentinel = RuntimeError("reached executor.submit (packaging resolved without a packager)") - try: - evaluator.submit(metric=metric, dataset=dataset) - except MetricBundlePackagerPolicyError as error: # pragma: no cover - regression guard - pytest.fail(f"built-in submit should not require a packager: {error}") - except Exception: - # Any other failure (e.g. connection refused with no live service) is fine; - # it means the built-in metric was bundled inline and reached execution. 
- pass + with patch.object(evaluator._executor, "submit", side_effect=sentinel): + with pytest.raises(RuntimeError, match="reached executor.submit"): + evaluator.submit(metric=metric, dataset=dataset) def test_custom_submit_requires_an_explicit_packager() -> None: diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/api/schemas.py b/plugins/nemo-evaluator/src/nemo_evaluator/api/schemas.py index 8f7f7e93cd..b90ee9097e 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/api/schemas.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/api/schemas.py @@ -15,7 +15,7 @@ ) from nemo_evaluator_sdk.values.common import SecretRef from nemo_platform_plugin.schema import DatetimeFilter, Filter -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, field_validator class CloudpickleMetricPayload(BaseModel): @@ -60,6 +60,20 @@ class InlineMetricPayload(BaseModel): description="SHA-256 digest of the canonical metric JSON. Informational; recomputed server-side.", ) + @field_validator("metric") + @classmethod + def _metric_must_declare_type(cls, value: dict[str, Any]) -> dict[str, Any]: + """Reject payloads without a metric ``type`` discriminator at the API boundary. + + The metric body stays an open object (the concrete shape is validated when + the bundle is hydrated against the metric type union), but a non-empty + ``type`` is required so malformed payloads fail fast rather than at execution. + """ + metric_type = value.get("type") + if not isinstance(metric_type, str) or not metric_type: + raise ValueError("inline metric payload must include a non-empty 'type'") + return value + # Discriminated on ``kind`` so additional payload formats can join the union # without changing the field type.