Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions docs/evaluator/index.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ Submit your evaluation to the Evaluator service using the NeMo Platform SDK:

```python
from nemo_evaluator.sdk import Evaluator
-from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager
from nemo_platform import NeMoPlatform


Expand All @@ -74,7 +73,6 @@ job = evaluator.submit(
metric=metric,
dataset=dataset,
config=config,
-metric_bundle_packager=CloudpickleMetricBundlePackager(),
)
job.wait_until_done()
result = job.get_result()
Expand Down
2 changes: 0 additions & 2 deletions docs/evaluator/metrics/agent-configuration.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,6 @@ from nemo_evaluator_sdk import Agent, RunConfigOnline


from nemo_evaluator_sdk import ExactMatchMetric
-from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager
metric = ExactMatchMetric(reference="{{item.expected_answer}}")
agent = Agent(
url="https://my-nat-agent.example.com",
Expand All @@ -185,7 +184,6 @@ job = evaluator.submit(
{"role": "user", "content": "{{item.question}}"},
],
},
-metric_bundle_packager=CloudpickleMetricBundlePackager(),
)
job.wait_until_done()
result = job.get_result()
Expand Down
14 changes: 0 additions & 14 deletions docs/evaluator/metrics/agentic.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,6 @@ print(result.aggregate_scores)
```python
from nemo_evaluator_sdk import RunConfig
from nemo_evaluator_sdk.metrics.ragas import ToolCallAccuracyMetric
-from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager
metric = ToolCallAccuracyMetric()

job = evaluator.submit(
Expand All @@ -251,7 +250,6 @@ job = evaluator.submit(
}
],
config=RunConfig(parallelism=4),
-metric_bundle_packager=CloudpickleMetricBundlePackager(),
)
job.wait_until_done()
result = job.get_result()
Expand Down Expand Up @@ -430,7 +428,6 @@ print(result.aggregate_scores)

```python
from nemo_evaluator_sdk import RunConfig, ToolCallingMetric
-from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager

metric = ToolCallingMetric(reference="{{item.tool_calls}}")

Expand Down Expand Up @@ -465,7 +462,6 @@ job = evaluator.submit(
}
],
config=RunConfig(parallelism=4),
-metric_bundle_packager=CloudpickleMetricBundlePackager(),
)
job.wait_until_done()
result = job.get_result()
Expand Down Expand Up @@ -572,7 +568,6 @@ print(result.aggregate_scores)
```python
from nemo_evaluator_sdk import RunConfig, Model
from nemo_evaluator_sdk.metrics.ragas import TopicAdherenceMetric
-from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager

judge_model = Model(
url="https://integrate.api.nvidia.com/v1/chat/completions",
Expand All @@ -596,7 +591,6 @@ job = evaluator.submit(
}
],
config=RunConfig(parallelism=4),
-metric_bundle_packager=CloudpickleMetricBundlePackager(),
)
job.wait_until_done()
result = job.get_result()
Expand Down Expand Up @@ -768,7 +762,6 @@ print(result.aggregate_scores)
```python
from nemo_evaluator_sdk import RunConfig, Model
from nemo_evaluator_sdk.metrics.ragas import AgentGoalAccuracyMetric
-from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager

judge_model = Model(
url="https://integrate.api.nvidia.com/v1/chat/completions",
Expand Down Expand Up @@ -798,7 +791,6 @@ job = evaluator.submit(
}
],
config=RunConfig(parallelism=4),
-metric_bundle_packager=CloudpickleMetricBundlePackager(),
)
job.wait_until_done()
result = job.get_result()
Expand Down Expand Up @@ -925,7 +917,6 @@ print(result.aggregate_scores)
```python
from nemo_evaluator_sdk import RunConfig, Model
from nemo_evaluator_sdk.metrics.ragas import AgentGoalAccuracyMetric
-from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager

judge_model = Model(
url="https://integrate.api.nvidia.com/v1/chat/completions",
Expand Down Expand Up @@ -963,7 +954,6 @@ job = evaluator.submit(
}
],
config=RunConfig(parallelism=4),
-metric_bundle_packager=CloudpickleMetricBundlePackager(),
)
job.wait_until_done()
result = job.get_result()
Expand Down Expand Up @@ -1023,7 +1013,6 @@ print(result.aggregate_scores)
```python
from nemo_evaluator_sdk import RunConfig, Model
from nemo_evaluator_sdk.metrics.ragas import AnswerAccuracyMetric
-from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager

judge_model = Model(
url="https://integrate.api.nvidia.com/v1/chat/completions",
Expand All @@ -1042,7 +1031,6 @@ job = evaluator.submit(
}
],
config=RunConfig(parallelism=4),
-metric_bundle_packager=CloudpickleMetricBundlePackager(),
)
job.wait_until_done()
result = job.get_result()
Expand All @@ -1055,7 +1043,6 @@ print(result.aggregate_scores)
```python
from nemo_evaluator_sdk import RunConfigOnlineModel, InferenceParams, Model
from nemo_evaluator_sdk.metrics.ragas import AnswerAccuracyMetric
-from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager

judge_model = Model(
url="https://integrate.api.nvidia.com/v1/chat/completions",
Expand Down Expand Up @@ -1090,7 +1077,6 @@ job = evaluator.submit(
}
]
},
-metric_bundle_packager=CloudpickleMetricBundlePackager(),
)

job.wait_until_done()
Expand Down
2 changes: 0 additions & 2 deletions docs/evaluator/metrics/job-management.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ from nemo_evaluator.sdk import Evaluator
from nemo_platform import NeMoPlatform
from nemo_evaluator_sdk import RunConfig
from nemo_evaluator_sdk import ExactMatchMetric
-from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager

sdk = NeMoPlatform(
base_url=os.environ.get("NMP_BASE_URL", "http://localhost:8080"),
Expand All @@ -38,7 +37,6 @@ job = evaluator.submit(
{"expected": "Berlin", "output": "Munich"},
],
config=RunConfig(parallelism=4),
-metric_bundle_packager=CloudpickleMetricBundlePackager(),
)
print("Submitted job:", job.name)

Expand Down
20 changes: 8 additions & 12 deletions docs/evaluator/metrics/llm-as-a-judge.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,6 @@ For production workloads, submit the same metric and dataset as a durable platfo

```python
from nemo_evaluator_sdk import RunConfig, JSONScoreParser, Model, RubricScore, LLMJudgeMetric
-from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager

metric = LLMJudgeMetric(
model=Model(
Expand Down Expand Up @@ -347,7 +346,6 @@ job = evaluator.submit(
{"input": "What is 2 + 2?", "output": "4"},
],
config=RunConfig(parallelism=8, limit_samples=100),
-metric_bundle_packager=CloudpickleMetricBundlePackager(),
)
print("Submitted job:", job.name)

Expand Down Expand Up @@ -436,13 +434,13 @@ By default, the JSON parser is used for range and rubric scores, with the score

```python
# JSON parser (default)
-"parser": {"type": "json", "json_path": "quality"}
+parser = {"type": "json", "json_path": "quality"}

# Regex parser (for models that do not support structured output)
-"parser": {"type": "regex", "pattern": "QUALITY: (\\w+)"}
+parser = {"type": "regex", "pattern": "QUALITY: (\\w+)"}

# Regex parser with method='search' (finds pattern anywhere in text)
-"parser": {"type": "regex", "pattern": "SCORE: (\\d+)", "method": "search"}
+parser = {"type": "regex", "pattern": "SCORE: (\\d+)", "method": "search"}
```

<Tip>
Expand Down Expand Up @@ -642,15 +640,13 @@ metric = {
Control judge model behavior with inference parameters:

```python
-
-"prompt_template": {
+prompt_template = {
"messages": [...],
-"temperature": 0.1, # Lower for more consistent scoring
-"max_tokens": 1024, # Increase if judge needs more space
-"timeout": 30, # Request timeout in seconds
-"stop": ["<{{ end_of_text }}>"] # Stop sequences
+"temperature": 0.1, # Lower for more consistent scoring
+"max_tokens": 1024, # Increase if judge needs more space
+"timeout": 30, # Request timeout in seconds
+"stop": ["<end_of_text>"], # Stop sequences
}
-
```

<Note>
Expand Down
2 changes: 0 additions & 2 deletions docs/evaluator/metrics/manage-metrics.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@ For online evaluations, provide a model or agent target and use the online param

```python
from nemo_evaluator_sdk import RunConfig, ExactMatchMetric
-from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager

metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}")

Expand All @@ -109,7 +108,6 @@ job = evaluator.submit(
{"expected": "Berlin", "output": "Munich"},
],
config=RunConfig(parallelism=4),
-metric_bundle_packager=CloudpickleMetricBundlePackager(),
)

job.wait_until_done()
Expand Down
2 changes: 0 additions & 2 deletions docs/evaluator/metrics/model-configuration.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -208,14 +208,12 @@ Durable remote `evaluator.submit(...)` jobs additionally accept a `ModelRef` tar

```python
from nemo_evaluator_sdk import ModelRef, RunConfigOnlineModel
-from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager

job = evaluator.submit(
metric=metric,
dataset=dataset,
config=RunConfigOnlineModel(),
target=ModelRef(root="default/my-model"),
-metric_bundle_packager=CloudpickleMetricBundlePackager(),
)
```

Expand Down
Loading
Loading