|
| 1 | +# profiling_example.yaml |
| 2 | +# ----------------------------------------------------------------------------- |
| 3 | +# Example settings overlay showing how to drive the `profiling` stage with |
| 4 | +# multiple models. It wires three widely used models — Qwen3-30B-A3B, |
| 5 | +# Nemotron-Super, and Gemma-4 — to the shared GPT-OSS-120B judge. |
| 6 | +# |
| 7 | +# Adapt to your cluster: |
| 8 | +# * Replace the `model:` strings below with local paths (e.g. |
| 9 | +# /path/to/models/Qwen3-30B-A3B) if you have the weights pre-staged, |
| 10 | +# or keep the HF hub identifiers to let vLLM download on first use. |
| 11 | +# * Tune `server_gpus`, `server_nodes`, and sampling params |
| 12 | +# (temperature / top_p / top_k / tokens_to_generate) per model. |
| 13 | +# * Gemma-4 needs a vLLM build that supports the `gemma4` architecture; |
| 14 | +# set `server_container` on the Gemma entry if your cluster's default |
| 15 | +# vLLM image predates that support. |
| 16 | +# |
| 17 | +# Usage: |
| 18 | +# ns run ... --settings profiling_example |
| 19 | +# ----------------------------------------------------------------------------- |
| 20 | + |
| 21 | +stages: |
| 22 | + profiling: |
| 23 | + # Shared judge — an entry that sets its own judge_kwargs overrides it for that model only. |
| 24 | + # NOTE: do not set `generation_type` in judge_kwargs.args — the orchestrator |
| 25 | + # passes `generation_type="math_judge"` to generate() explicitly; a duplicate |
| 26 | + # would raise TypeError. |
| 27 | + judge_kwargs: |
| 28 | + args: |
| 29 | + model: openai/gpt-oss-120b |
| 30 | + server_type: vllm |
| 31 | + server_gpus: 8 |
| 32 | + server_nodes: 1 |
| 33 | + num_chunks: 5 |
| 34 | + ctx_args: >- |
| 35 | + ++prompt_config=judge/general-judge |
| 36 | +
|
| 37 | + models: |
| 38 | + # ----- Qwen3-30B-A3B ----------------------------------------------------- |
| 39 | + - name: qwen3-30b-a3b |
| 40 | + generation_kwargs: |
| 41 | + args: |
| 42 | + model: Qwen/Qwen3-30B-A3B |
| 43 | + server_type: vllm |
| 44 | + server_gpus: 4 |
| 45 | + server_nodes: 1 |
| 46 | + num_random_seeds: 5 |
| 47 | + num_chunks: 20 |
| 48 | + ctx_args: >- |
| 49 | + ++prompt_config=generic/general-boxed |
| 50 | + ++inference.tokens_to_generate=16000 |
| 51 | +
|
| 52 | + # ----- Nemotron-Super ----------------------------------------------------- |
| 53 | + # Use the publicly available Llama-3.3-Nemotron-Super variant by default. |
| 54 | + # Swap to the internal Nemotron-Super-120B HF id / local path if you have |
| 55 | + # access. Recommended sampling: temp=0.6, top_p=0.95 (per model card). |
| 56 | + - name: nemotron-super |
| 57 | + generation_kwargs: |
| 58 | + args: |
| 59 | + model: nvidia/Llama-3_3-Nemotron-Super-49B-v1_5 |
| 60 | + server_type: vllm |
| 61 | + server_gpus: 8 |
| 62 | + server_nodes: 1 |
| 63 | + num_random_seeds: 5 |
| 64 | + num_chunks: 20 |
| 65 | + ctx_args: >- |
| 66 | + ++prompt_config=generic/general-boxed |
| 67 | + ++inference.tokens_to_generate=16000 |
| 68 | + ++inference.temperature=0.6 |
| 69 | + ++inference.top_p=0.95 |
| 70 | +
|
| 71 | + # ----- Gemma-4-31B-IT ----------------------------------------------------- |
| 72 | + # Requires a vLLM image with Gemma-4 architecture support. If your default |
| 73 | + # `containers.vllm` image fails with `Unrecognized architecture: gemma4`, set a |
| 74 | + # Gemma-4-compatible image on this entry, e.g.: |
| 75 | + # server_container: /path/to/vllm-gemma.sqsh |
| 76 | + # Recommended sampling (per model card): temp=1.0, top_p=0.95, top_k=64. |
| 77 | + - name: gemma-4-31b-it |
| 78 | + generation_kwargs: |
| 79 | + args: |
| 80 | + model: google/gemma-4-31b-it |
| 81 | + server_type: vllm |
| 82 | + server_gpus: 8 |
| 83 | + server_nodes: 1 |
| 84 | + num_random_seeds: 5 |
| 85 | + num_chunks: 20 |
| 86 | + ctx_args: >- |
| 87 | + ++prompt_config=generic/general-boxed |
| 88 | + ++inference.endpoint_type=chat |
| 89 | + ++inference.tokens_to_generate=16000 |
| 90 | + ++inference.temperature=1.0 |
| 91 | + ++inference.top_p=0.95 |
| 92 | + ++inference.top_k=64 |
0 commit comments