NVIDIA-NeMo
diff --git a/‎docs/libraries/nemo-evaluator/extending/byob/datasets.md‎
Lines changed: 42 additions & 1 deletion b/‎docs/libraries/nemo-evaluator/extending/byob/datasets.md‎
Lines changed: 42 additions & 1 deletion
diff --git a/‎docs/libraries/nemo-evaluator/extending/byob/scorers.md‎
Lines changed: 24 additions & 17 deletions b/‎docs/libraries/nemo-evaluator/extending/byob/scorers.md‎
Lines changed: 24 additions & 17 deletions
diff --git a/‎packages/nemo-evaluator/src/nemo_evaluator/client/client.py‎
Lines changed: 30 additions & 0 deletions b/‎packages/nemo-evaluator/src/nemo_evaluator/client/client.py‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎packages/nemo-evaluator/src/nemo_evaluator/contrib/byob/cli.py‎
Lines changed: 2 additions & 1 deletion b/‎packages/nemo-evaluator/src/nemo_evaluator/contrib/byob/cli.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎packages/nemo-evaluator/src/nemo_evaluator/contrib/byob/compiler.py‎
Lines changed: 29 additions & 35 deletions b/‎packages/nemo-evaluator/src/nemo_evaluator/contrib/byob/compiler.py‎
Lines changed: 29 additions & 35 deletions
diff --git a/‎packages/nemo-evaluator/src/nemo_evaluator/contrib/byob/containerize.py‎
Lines changed: 7 additions & 5 deletions b/‎packages/nemo-evaluator/src/nemo_evaluator/contrib/byob/containerize.py‎
Lines changed: 7 additions & 5 deletions
diff --git a/‎packages/nemo-evaluator/src/nemo_evaluator/contrib/byob/decorators.py‎
Lines changed: 14 additions & 20 deletions b/‎packages/nemo-evaluator/src/nemo_evaluator/contrib/byob/decorators.py‎
Lines changed: 14 additions & 20 deletions
@@ -136,12 +136,53 @@ If you put `hf://` URIs with `&` query parameters in shell command
 templates, quote the dataset argument:
 
 ```bash
---dataset "{{config.params.extra.dataset_uri}}"
+--dataset "{{config.params.extra.dataset.path}}"
 ```
 
 Otherwise the shell treats `&` as a background-command separator.
 ::::
 
+### `extra.dataset.*` namespace
+
+BYOB groups dataset-related configuration under
+`config.params.extra.dataset.*` in the FDF / run_config:
+
+| Key | Description |
+|-----|-------------|
+| `path` | Dataset file path or `hf://` URI (compile-time default from `@benchmark(dataset=...)`). |
+| `num_fewshot` | Optional few-shot example count (lm-eval-harness parity). |
+| `field_mapping` | Informational mirror of `@benchmark(field_mapping=...)`. |
+| `choices` / `choices_field` | Informational mirror of `@benchmark(choices=...)` / `@benchmark(choices_field=...)`. |
+
+### Overriding the dataset at run time
+
+The `@benchmark` decorator's `dataset=` value is the compile-time default. To
+swap it for a single run without rebuilding the benchmark, set
+`config.params.extra.dataset.path` via the launcher's run_config or CLI. The
+launcher deep-merges via OmegaConf, so sibling keys under `extra.dataset`
+(`num_fewshot`, `field_mapping`, etc.) and under `extra` (`benchmark_module`,
+`requirements`, …) are preserved.
+
+```bash
+nemo-evaluator-launcher run --config my_config.yaml \
+  -o 'evaluation.tasks.<task_name>.nemo_evaluator_config.config.params.extra.dataset.path=hf://other/foo?split=test'
+```
+
+Or in a run_config YAML:
+
+```yaml
+evaluation:
+  tasks:
+    - name: <task_name>
+      nemo_evaluator_config:
+        config:
+          params:
+            extra:
+              dataset:
+                path: hf://other/foo?split=test
+                num_fewshot: 5
+```
+
 ## Field Mapping
 
 Use `field_mapping` to rename dataset columns so they match the `{placeholder}` names in your prompt template. The mapping is applied after loading the dataset and before prompt rendering.
 
@@ -13,28 +13,34 @@ Every scorer receives a single `ScorerInput` dataclass importable from `nemo_eva
 class ScorerInput:
     response: str              # Model output (or argmax choice in logprob mode)
     target: Any                # Ground truth from dataset
-    metadata: dict             # Full dataset row as a dict
+    metadata: dict             # Dataset row + per-call response metadata
     model_call_fn: Optional[Callable] = None
     config: Dict[str, Any] = field(default_factory=dict)
     conversation: Optional[List[dict]] = None
     turn_index: Optional[int] = None
-    choices: Optional[List[str]] = None
-    choices_logprobs: Optional[List[float]] = None
-    choices_is_greedy: Optional[List[bool]] = None
 ```
 
 | Field | Description |
 |-------|-------------|
 | `response` | The model output text for the current sample. In `completions_logprob` mode this is set to the choice with the highest sum-logprob (i.e. the argmax). |
 | `target` | The ground-truth value read from the field specified by `target_field` in `@benchmark`. |
-| `metadata` | The entire dataset row as a dictionary, useful for accessing additional fields beyond the target. |
+| `metadata` | Shared bag for **dataset-row fields and per-call response metadata**. Standard scorers use it to access any column on the row (e.g. `sample.metadata["passage"]`). Strategies that produce extra per-call data write namespaced keys (prefixed with `_`) into this dict before invoking the scorer. |
 | `model_call_fn` | Reserved for multi-turn evaluation (not yet implemented). |
 | `config` | Extra configuration passed through `extra=` in `@benchmark` (e.g. judge settings). |
 | `conversation` | Reserved for multi-turn benchmarks (not yet implemented). |
 | `turn_index` | Reserved for multi-turn benchmarks (not yet implemented). |
-| `choices` | Populated by `MultipleChoiceStrategy` with the candidate continuation list (resolved from `choices=` or `choices_field=` on `@benchmark`). |
-| `choices_logprobs` | Per-choice sum log-probabilities returned by the loglikelihood call. Same length as `choices`. |
-| `choices_is_greedy` | Per-choice booleans: True when every continuation token equals the top-1 prediction (i.e. the choice would have been produced under greedy decoding). Same length as `choices`. |
+
+### Reserved metadata keys
+
+`MultipleChoiceStrategy` (selected by `endpoint_type="completions_logprob"`) writes the following keys into `ScorerInput.metadata` before invoking the scorer:
+
+| Key | Type | Description |
+|-----|------|-------------|
+| `_choices` | `list[str]` | Candidate continuations resolved from `choices=` or `choices_field=` on `@benchmark`. |
+| `_choices_logprobs` | `list[float]` | Per-choice sum log-probabilities returned by the loglikelihood call. Same length as `_choices`. |
+| `_choices_is_greedy` | `list[bool]` | Per-choice booleans: `True` when every continuation token equals the top-1 prediction (i.e. the choice would have been produced under greedy decoding). Same length as `_choices`. |
+
+`response` is also set to `_choices[argmax(_choices_logprobs)]` so legacy text-based scorers continue to work in logprob mode.
 
 ## The @scorer Decorator
 
@@ -196,15 +202,16 @@ The runner inspects `logprobs.text_offset` to locate the continuation
 token span, sums `token_logprobs` over that span, and decides
 `is_greedy` by checking whether each continuation token matches the
 top-1 entry of `top_logprobs`. The resulting per-choice
-`(sum_logprob, is_greedy)` tuples are placed on `ScorerInput.choices`,
-`choices_logprobs`, and `choices_is_greedy`. `multiple_choice_acc`
-then computes:
-
-- `acc` -- 1.0 iff `argmax(choices_logprobs) == gold_index` (MMLU
-  canonical).
-- `acc_norm` -- 1.0 iff `argmax(choices_logprobs[i] /
-  max(len(choices[i].encode("utf-8")), 1)) == gold_index` (ARC/BoolQ
-  canonical, per-byte length normalization).
+`(sum_logprob, is_greedy)` values are written into `ScorerInput.metadata` as
+the parallel lists `_choices_logprobs` and `_choices_is_greedy`, next to the
+candidate list `_choices`. `multiple_choice_acc` then computes:
+
+- `acc` -- 1.0 iff `argmax(metadata["_choices_logprobs"]) == gold_index`
+  (MMLU canonical).
+- `acc_norm` -- 1.0 iff
+  `argmax(metadata["_choices_logprobs"][i] /
+  max(len(metadata["_choices"][i].encode("utf-8")), 1)) == gold_index`
+  (ARC/BoolQ canonical, per-byte length normalization).
 - `acc_greedy` -- 1.0 iff the highest-loglikelihood **greedy** choice
   matches gold (diagnostic).
 
 
@@ -232,6 +232,36 @@ async def _make_request():
         response = await self._retry_with_backoff(_make_request)
         return response.choices[0].text or ""
 
+    async def loglikelihood(self, prompt: str, **kwargs) -> dict:
+        """Score *prompt* for per-token loglikelihoods (lm-eval-harness contract).
+
+        Posts ``/v1/completions`` with ``echo=true, logprobs=1, max_tokens=0``
+        so the server returns per-token log-probabilities for the entire
+        prompt without generating new tokens. Returns the full response body
+        as a dict so callers can inspect ``logprobs.tokens``,
+        ``logprobs.token_logprobs``, ``logprobs.text_offset``, and
+        ``logprobs.top_logprobs``.
+
+        Honours ``self.semaphore`` and ``self._retry_with_backoff`` exactly
+        like ``chat_completion`` / ``completion``.
+        """
+        params = {
+            "model": self.model_id,
+            "prompt": prompt,
+            "max_tokens": 0,
+            "temperature": 0.0,
+            "logprobs": 1,
+            "echo": True,
+            **kwargs,
+        }
+
+        async def _make_request():
+            async with self.semaphore:
+                return await self.client.completions.create(**params)
+
+        response = await self._retry_with_backoff(_make_request)
+        return response.model_dump()
+
     def completions(
         self,
         prompts: List[str],
 
@@ -202,7 +202,8 @@ def byob_compile(args=None):
         for name, fdf in compiled.items():
             eval_entry = fdf["evaluations"][0]
             print(f"  - {eval_entry['name']} (normalized: {name})")
-            ds = fdf["defaults"]["config"]["params"]["extra"]["dataset"]
+            ds_cfg = fdf["defaults"]["config"]["params"]["extra"]["dataset"]
+            ds = ds_cfg["path"] if isinstance(ds_cfg, dict) else ds_cfg
             print(f"    Dataset: {ds}")
             if os.path.exists(ds):
                 with open(ds) as f:
 
@@ -42,26 +42,22 @@
 # Jinja2 command template for runner invocation
 # NOTE: Use plain string concatenation to avoid f-string escaping issues with {{ }}
 #
-# Dataset resolution precedence (Req 2 - "swap the input file, keep same
-# task name / prompt / scoring"):
+# Dataset-related config is grouped under ``config.params.extra.dataset.*``:
 #
-#   1. config.params.extra.dataset_uri   (override; hf:// URI or local path)
-#   2. config.params.extra.dataset       (compile-time default from @benchmark)
+#   - ``path``        -- dataset file path or ``hf://`` URI (compile-time
+#                        default from ``@benchmark(dataset=...)``)
+#   - ``num_fewshot`` -- optional few-shot example count (lm-eval-harness
+#                        parity)
+#   - ``field_mapping``, ``choices``, ``choices_field`` -- informational
+#                        metadata; the runner picks up the live values from
+#                        the ``@benchmark`` registry, so the copies in the
+#                        FDF are there for inspection only.
 #
-# When dataset_uri is set on a run (e.g. via
-# ``evaluation.tasks[0].nemo_evaluator_config.config.params.extra.dataset_uri=...``)
-# the runner fetches that URI instead, without any change to the benchmark
-# module, prompt template, scorer, or task name.
 COMMAND_TEMPLATE = (
     "python -m nemo_evaluator.contrib.byob.runner"
     " --benchmark-module {{config.params.extra.benchmark_module}}"
     " --benchmark-name {{config.params.task}}"
-    "{% if config.params.extra.dataset_uri is defined"
-    " and config.params.extra.dataset_uri is not none %}"
-    ' --dataset "{{config.params.extra.dataset_uri}}"'
-    "{% else %}"
-    ' --dataset "{{config.params.extra.dataset}}"'
-    "{% endif %}"
+    ' --dataset "{{config.params.extra.dataset.path}}"'
     " --output-dir {{config.output_dir}}"
     " --model-url {{target.api_endpoint.url}}"
     " --model-id {{target.api_endpoint.model_id}}"
@@ -84,9 +80,9 @@
     "{% if config.params.request_timeout is not none %}"
     " --request-timeout {{config.params.request_timeout}}"
     "{% endif %}"
-    "{% if config.params.extra.num_fewshot is defined"
-    " and config.params.extra.num_fewshot is not none %}"
-    " --num-fewshot {{config.params.extra.num_fewshot}}"
+    "{% if config.params.extra.dataset.num_fewshot is defined"
+    " and config.params.extra.dataset.num_fewshot is not none %}"
+    " --num-fewshot {{config.params.extra.dataset.num_fewshot}}"
     "{% endif %}"
 )
 
@@ -108,29 +104,27 @@ def _build_fdf(
     Returns:
         FDF dict ready for YAML serialization.
     """
-    extra_params: dict = {
-        "benchmark_module": benchmark_module_ref,
-        "dataset": dataset_path,
-        # ``dataset_uri`` is the Req 2 override slot: setting it at run
-        # time (e.g. to a different hf:// URI with the same schema) makes
-        # the BYOB runner load that dataset instead of ``dataset`` while
-        # keeping task name, prompt template, and scorer unchanged. Null
-        # by default so the compile-time ``dataset`` is used.
-        "dataset_uri": None,
-        "requirements": bench.requirements,
-    }
-    # Propagate field_mapping if declared
+    # Dataset-specific config grouped under ``extra.dataset.*`` so that all
+    # dataset-shaped settings (path, fewshot count, field mapping, candidate
+    # choices) live under one namespace and don't pollute the top level of
+    # ``extra``.
+    dataset_params: dict = {"path": dataset_path}
     if bench.field_mapping:
-        extra_params["field_mapping"] = bench.field_mapping
-    # Few-shot defaults (lm-eval-harness parity)
+        dataset_params["field_mapping"] = bench.field_mapping
     if bench.num_fewshot:
-        extra_params["num_fewshot"] = bench.num_fewshot
+        dataset_params["num_fewshot"] = bench.num_fewshot
     # Multiple-choice loglikelihood metadata (informational; the runner
-    # picks up choices/choices_field from the @benchmark itself)
+    # picks up choices/choices_field from the @benchmark registry itself).
     if bench.choices is not None:
-        extra_params["choices"] = list(bench.choices)
+        dataset_params["choices"] = list(bench.choices)
     if bench.choices_field is not None:
-        extra_params["choices_field"] = bench.choices_field
+        dataset_params["choices_field"] = bench.choices_field
+
+    extra_params: dict = {
+        "benchmark_module": benchmark_module_ref,
+        "dataset": dataset_params,
+        "requirements": bench.requirements,
+    }
     # Propagate judge config(s) from @benchmark kwargs
     # Supports: judge={...}, judge_1={...}, judge_2={...}, etc.
     for key, value in bench.extra_config.items():
 
@@ -118,7 +118,7 @@ def generate_dockerfile(
 def rewrite_fdf_paths(fdf: dict, pkg_name: str) -> dict:
     """Rewrite host-local paths in an FDF dict to container paths.
 
-    Transforms ``extra.benchmark_module`` and ``extra.dataset`` from
+    Transforms ``extra.benchmark_module`` and ``extra.dataset.path`` from
     absolute host paths to container-relative paths under ``/opt/byob/``.
 
     Args:
@@ -136,12 +136,13 @@ def rewrite_fdf_paths(fdf: dict, pkg_name: str) -> dict:
         filename = os.path.basename(benchmark_module)
         extra["benchmark_module"] = f"/opt/byob/code/{filename}"
 
-    dataset = extra.get("dataset", "")
+    dataset_cfg = extra.get("dataset") or {}
+    dataset = dataset_cfg.get("path", "") if isinstance(dataset_cfg, dict) else ""
     if dataset and not dataset.startswith(
         ("hf://", "s3://", "gs://", "http://", "https://")
     ):
         filename = os.path.basename(dataset)
-        extra["dataset"] = f"/opt/byob/data/{filename}"
+        dataset_cfg["path"] = f"/opt/byob/data/{filename}"
 
     return fdf
 
@@ -193,7 +194,8 @@ def prepare_build_context(
     # Copy or fetch dataset to data/
     data_dir = context / "data"
     data_dir.mkdir(parents=True, exist_ok=True)
-    dataset = extra.get("dataset", "")
+    dataset_cfg = extra.get("dataset") or {}
+    dataset = dataset_cfg.get("path", "") if isinstance(dataset_cfg, dict) else ""
     if dataset:
         if os.path.isfile(dataset):
             # Local file — copy directly
@@ -207,7 +209,7 @@ def prepare_build_context(
                 result = fetcher.fetch(dataset, cache_dir=data_dir)
                 # Update the FDF to point to the local filename inside /opt/byob/data/
                 local_name = result.local_path.name
-                extra["dataset"] = f"/opt/byob/data/{local_name}"
+                dataset_cfg["path"] = f"/opt/byob/data/{local_name}"
                 # Move/copy if fetched to a different location than data_dir
                 if result.local_path.parent != data_dir:
                     shutil.copy2(str(result.local_path), str(data_dir / local_name))
 
@@ -47,14 +47,18 @@ class ScorerInput:
 
     This is the single argument passed to all BYOB scorer functions.
     Standard scorers use response, target, and metadata.
-    Advanced scorers (judge, multi-turn, multiple-choice loglikelihood)
-    use the optional fields.
-
-    For multiple-choice loglikelihood evaluation (lm-eval-harness style),
-    ``MultipleChoiceStrategy`` populates ``choices``, ``choices_logprobs``,
-    and ``choices_is_greedy`` before invoking the scorer. ``response`` is
-    set to ``choices[argmax(choices_logprobs)]`` so legacy text-based
-    scorers also work.
+    Advanced scorers (judge, multi-turn) use the optional fields.
+
+    Strategies that produce extra per-call data (e.g.
+    ``MultipleChoiceStrategy``) write namespaced keys into ``metadata``
+    before invoking the scorer. Reserved keys currently in use:
+
+    * ``_choices`` -- candidate continuations (list[str])
+    * ``_choices_logprobs`` -- per-choice sum log-probabilities (list[float])
+    * ``_choices_is_greedy`` -- per-choice greedy flags (list[bool])
+
+    ``response`` is set to ``_choices[argmax(_choices_logprobs)]`` for
+    multiple-choice mode so legacy text-based scorers also work.
     """
 
     response: str
@@ -65,10 +69,6 @@ class ScorerInput:
     config: Dict[str, Any] = field(default_factory=dict)
     conversation: Optional[List[dict]] = None
     turn_index: Optional[int] = None
-    # Multiple-choice loglikelihood fields (mirrors lm-eval-harness)
-    choices: Optional[List[str]] = None
-    choices_logprobs: Optional[List[float]] = None
-    choices_is_greedy: Optional[List[bool]] = None
 
 
 @dataclass
@@ -89,10 +89,8 @@ class BenchmarkDefinition:
     _is_jinja2: bool = False
     system_prompt: Optional[str] = None
     _is_system_prompt_jinja2: bool = False
-    # Multiple-choice loglikelihood support (mirrors lm-eval-harness)
     choices: Optional[List[str]] = None
     choices_field: Optional[str] = None
-    # Few-shot prompting (mirrors lm-eval-harness --num_fewshot)
     num_fewshot: int = 0
     fewshot_split: Optional[str] = None
     fewshot_template: Optional[str] = None
@@ -186,10 +184,7 @@ def benchmark(
         prompt: Python format string with {field} placeholders, or path to
                 a template file (.txt, .md, .jinja, .jinja2).
         target_field: JSONL field containing ground truth.
-        endpoint_type: ``"chat"``, ``"completions"``, or
-            ``"completions_logprob"``. The last value enables per-choice
-            loglikelihood ranking (lm-evaluation-harness ``local-completions``
-            parity) and requires either ``choices`` or ``choices_field``.
+        endpoint_type: ``"chat"``, ``"completions"``, or ``"completions_logprob"``.
         requirements: Pip dependencies. Either a list of specifiers
                       (e.g., ["rouge-score>=0.1.2"]) or a path to a
                       requirements.txt file. None means no extra deps.
@@ -215,8 +210,7 @@ def benchmark(
         num_fewshot: Number of few-shot examples to prepend to each
             prompt. Examples are sampled deterministically from
             ``fewshot_split`` (or the first ``num_fewshot`` rows of the
-            evaluation dataset when ``fewshot_split`` is None). Mirrors
-            lm-eval-harness's ``--num_fewshot`` flag.
+            evaluation dataset when ``fewshot_split`` is None).
         fewshot_split: HuggingFace split name to sample few-shot examples
             from (e.g. ``"train"`` or ``"dev"``). Only meaningful when the
             primary ``dataset`` is an ``hf://`` URI.