diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md index cf93c392fa6..16f1b2cb6a5 100644 --- a/.claude/skills/evaluation/SKILL.md +++ b/.claude/skills/evaluation/SKILL.md @@ -48,7 +48,7 @@ Run `nel --version`; if missing, instruct `pip install nemo-evaluator-launcher`. 1. Read the task reference file(s). 2. Use `recipes/examples/example_eval.yaml` as the base. 3. Copy the YAML fragment(s) into `evaluation.tasks`, applying any per-task notes. -4. **MLflow auto-export is on by default** — copy the `export.mlflow` block from `example_eval.yaml` verbatim. The defaults inside that block (Hydra-interpolated `experiment_name`, `description`, `tags`) only need `tracking_uri` filled in Step 4. See `example_eval.yaml` for the canonical block. +4. **MLflow auto-export is on by default** — it needs **two** pieces, both in `example_eval.yaml`: (a) the **trigger** `execution.auto_export.destinations: [mlflow]` (without it the run is *not* uploaded), and (b) the `export.mlflow` block that configures it. In the `export.mlflow` block use **literal** values for `experiment_name` / `description` / `tags` — replace every `CHANGEME-served-model-name` placeholder with the actual `served_model_name` and set the sampling values to the ones the run uses. Do **not** use `${deployment.*}` / `${evaluation.*}` cross-references: with auto-export on, NEL resolves the export block at submit time in a scope without those nodes and fails with `Interpolation key '...' not found` (`${oc.env:USER}` is fine — it's an env var). Because these literals are not interpolated, keep the `temperature` / `top_p` / `max_new_tokens` tags (and the same values embedded in `description`) **equal to** the top-level `evaluation.nemo_evaluator_config.config.params` and update both in the same edit — the tags are the only queryable record of sampling in MLflow (NEL doesn't log them as run params), so a stale tag silently misreports the run. Fill `tracking_uri` in Step 4. 5. Proceed to Step 3, then Step 4, then Step 7.5/8. Skip Step 2's 5-question flow.
--- diff --git a/.claude/skills/evaluation/recipes/examples/example_eval.yaml b/.claude/skills/evaluation/recipes/examples/example_eval.yaml index 0b6ea6b2e9b..c3a48d8c58b 100644 --- a/.claude/skills/evaluation/recipes/examples/example_eval.yaml +++ b/.claude/skills/evaluation/recipes/examples/example_eval.yaml @@ -45,9 +45,9 @@ execution: walltime: "04:00:00" mounts: mount_home: false - auto_export: - destinations: - - mlflow + auto_export: # REQUIRED trigger for auto-export. Without this, the + destinations: # export.mlflow block below is ignored and the run is + - mlflow # NOT uploaded — you'd have to `nel export` it by hand. deployment: env_vars: HF_TOKEN: host:HF_TOKEN @@ -95,18 +95,29 @@ evaluation: n_samples: 16 export: + # Use LITERAL values below — NOT ${deployment.*} / ${evaluation.*} cross-refs. + # With auto_export enabled (above), NEL resolves this block at SUBMIT time in a + # scope that does NOT include `deployment` / `evaluation`, so cross-references + # fail hard: "Interpolation key 'deployment.served_model_name' not found". + # `${oc.env:USER}` (an env-var interpolation) is fine. + # + # CAUTION — these literals can drift. temperature / top_p / max_new_tokens are the + # ONLY queryable record of the sampling config in MLflow (NEL does not log them as + # run params), so keep them — but because they can't be interpolated, they MUST be + # kept EQUAL to evaluation.nemo_evaluator_config.config.params above. When you + # change the sampling params (or served_model_name), update these literals in the + # SAME edit, or MLflow will misreport the run. mlflow: tracking_uri: ??? 
- experiment_name: ${oc.env:USER}/${deployment.served_model_name} - description: '${oc.env:USER}/${deployment.served_model_name} | T=${evaluation.nemo_evaluator_config.config.params.temperature}, top_p=${evaluation.nemo_evaluator_config.config.params.top_p}, - max_new_tokens=${evaluation.nemo_evaluator_config.config.params.max_new_tokens}' + experiment_name: ${oc.env:USER}/CHANGEME-served-model-name + description: 'CHANGEME-served-model-name | T=1.0, top_p=0.95, max_new_tokens=65536' log_logs: true log_artifacts: true only_required: false skip_existing: false tags: framework: vllm - model: ${deployment.served_model_name} - temperature: '${evaluation.nemo_evaluator_config.config.params.temperature}' - top_p: '${evaluation.nemo_evaluator_config.config.params.top_p}' - max_new_tokens: '${evaluation.nemo_evaluator_config.config.params.max_new_tokens}' + model: CHANGEME-served-model-name + temperature: '1.0' + top_p: '0.95' + max_new_tokens: '65536'