|
| 1 | +[ |
| 2 | + { |
| 3 | + "name": "nemotron3-nano-bf16-reasoning", |
| 4 | + "skills": ["evaluation"], |
| 5 | + "query": "Help me evaluate Nemotron 3 Nano BF16 from NVIDIA", |
| 6 | + "files": [], |
| 7 | + "expected_behavior": [ |
| 8 | + "Verifies nel is installed by running 'nel --version'", |
| 9 | + "Asks all 5 base config questions (execution, deployment, auto-export, model type, benchmarks) before generating the config", |
| 10 | + "Runs 'nel skills build-config' with correct flags matching user answers: --execution slurm --deployment vllm --model-type reasoning --benchmarks standard code math_reasoning --export mlflow", |
| 11 | + "Searches the web for the model card on HuggingFace and extracts model-specific settings", |
| 12 | + "Sets correct HF handle: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", |
| 13 | + "Sets reasoning sampling params from model card: temperature=1.0, top_p=1.0", |
| 14 | + "Configures reasoning toggle via params_to_add with chat_template_kwargs.enable_thinking (not via system prompt)", |
| 15 | + "Disables reasoning for IFEval task using enable_thinking: false with use_system_prompt: false", |
| 16 | + "Adds deployment.pre_cmd using curl (not wget) to download nano_v3_reasoning_parser.py from HuggingFace", |
| 17 | + "Adds vLLM extra_args including --trust-remote-code, --reasoning-parser-plugin, --reasoning-parser nano_v3, --max-num-seqs 8", |
| 18 | + "Pins vLLM image to v0.12.0 or later as required by model card", |
| 19 | + "Adds target.api_endpoint.api_key_name: DUMMY_API_KEY for nemo_skills tasks with self-deployment", |
| 20 | + "Fills in all ??? placeholders after asking the user for SLURM hostname, account, output_dir, MLflow tracking_uri, and experiment_name", |
| 21 | + "Applies user-requested SLURM customizations: partition batch_short, walltime 00:20:00, MLflow tag scenario: demo", |
| 22 | + "Presents task list and waits for user confirmation before proceeding", |
| 23 | + "Configures request and response logging interceptors under evaluation.nemo_evaluator_config.config.target.api_endpoint.adapter_config using correct field names (max_logged_requests/max_logged_responses, not max_saved_*)", |
| 24 | + "Handles dry-run failure for missing HF_TOKEN_FOR_GPQA_DIAMOND by offering to fix the config", |
| 25 | + "Successfully submits test run with limit_samples=10 after dry-run passes", |
| 26 | + "Provides monitoring commands (nel status, nel info --logs) and inspects server logs via SSH when asked" |
| 27 | + ] |
| 28 | + }, |
| 29 | + { |
| 30 | + "name": "quantized-checkpoint-local-vllm", |
| 31 | + "skills": ["evaluation"], |
| 32 | + "query": "evaluate my FP8 quantized Llama checkpoint at ./llama-3.1-8b-fp8 on MMLU and GSM8K", |
| 33 | + "files": [], |
| 34 | + "expected_behavior": [ |
| 35 | + "Verifies nel is installed by running 'nel --version'", |
| 36 | + "Asks all 5 base config questions (execution, deployment, auto-export, model type, benchmarks)", |
| 37 | + "Runs 'nel skills build-config' with correct flags matching user answers", |
| 38 | + "Sets deployment.checkpoint_path to ./llama-3.1-8b-fp8 and deployment.hf_model_handle to null", |
| 39 | + "Auto-detects quantization format by reading ./llama-3.1-8b-fp8/hf_quant_config.json", |
| 40 | + "Finds quant_algo=FP8 and adds --quantization modelopt to deployment.extra_args", |
| 41 | + "Recommends accuracy-sensitive benchmarks from references/quantization-benchmarks.md", |
| 42 | + "Searches the web for the Llama-3.1-8B model card and extracts sampling params, context length, and TP settings", |
| 43 | + "Fills in remaining missing values by asking the user", |
| 44 | + "Runs dry-run, then test with limit_samples=10, then full evaluation", |
| 45 | + "Reports accuracy results per benchmark" |
| 46 | + ] |
| 47 | + }, |
| 48 | + { |
| 49 | + "name": "slurm-quantized-model", |
| 50 | + "skills": ["evaluation"], |
| 51 | + "query": "Evaluate my quantized Llama-3.1-8B-FP8 checkpoint on mmlu and gsm8k on the SLURM cluster", |
| 52 | + "files": [], |
| 53 | + "expected_behavior": [ |
| 54 | + "Verifies nel is installed by running 'nel --version'", |
| 55 | + "Asks 5 base config questions with execution=slurm pre-selected based on user request", |
| 56 | + "Runs 'nel skills build-config' with --execution slurm --deployment vllm --benchmarks standard", |
| 57 | + "Detects FP8 quantization from hf_quant_config.json and sets deployment.extra_args with --quantization modelopt", |
| 58 | + "Reads references/quantization-benchmarks.md and recommends accuracy-sensitive benchmarks", |
| 59 | + "Uses WebSearch to research model card for sampling params and context length", |
| 60 | + "Fills in SLURM-specific values (hostname, account, and partition) from user input", |
| 61 | + "Runs dry-run validation before full evaluation", |
| 62 | + "Provides SSH-based log monitoring commands for SLURM execution" |
| 63 | + ] |
| 64 | + } |
| 65 | +] |
0 commit comments