Skip to content

Commit 50be962

Browse files
author
pensieve-intern
committed
[OMNIML-4740] synth_support — pensieve-intern agent draft
1 parent e2d29c8 commit 50be962

2 files changed

Lines changed: 55 additions & 2 deletions

File tree

tools/launcher/common/service_utils.sh

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
native_mpi_rank=$OMPI_COMM_WORLD_RANK
1919
native_mpi_local_rank=$OMPI_COMM_WORLD_LOCAL_RANK
2020
# Works with Slurm launching with `--mpi=pmix`
21-
mpi_rank=${PMIX_RANK:-$native_mpi_rank}
22-
mpi_local_rank=${PMIX_LOCAL_RANK:-$native_mpi_local_rank}
21+
mpi_rank=${PMIX_RANK:-${native_mpi_rank:-${SLURM_PROCID:-0}}}
22+
mpi_local_rank=${PMIX_LOCAL_RANK:-${native_mpi_local_rank:-${SLURM_LOCALID:-0}}}
2323

2424
FAIL=0
2525
FAIL_EXIT=0
@@ -48,8 +48,23 @@ function report_result {
4848
}
4949

5050
#######################################
# Install the extra Python dependencies once, coordinated through a marker file.
#
# The process whose mpi_local_rank is 0 installs `diskcache` and
# `nvidia-resiliency-ext` (shallow-cloned from GitHub) and then creates the
# marker file. Every other local rank polls for the marker once per second,
# for at most 600 seconds.
#
# The marker is created ONLY when every install step succeeded, so waiting
# ranks never continue on top of a half-installed environment.
#
# Globals:   mpi_local_rank (read)
# Arguments: none
# Outputs:   installer output on stdout/stderr; diagnostics on stderr
# Returns:   0 if the marker already exists or the install/wait completed;
#            non-zero if an install step failed or the wait timed out.
#######################################
function util_install_extra_dep {
    local _marker=/tmp/.nmm_extra_dep_installed
    if [[ -f "$_marker" ]]; then
        return 0
    fi

    if [[ "$mpi_local_rank" -eq 0 ]]; then
        local _clone_root _nvrx_dir _rc=0
        _clone_root="$(mktemp -d)" || {
            echo "util_install_extra_dep: mktemp -d failed" >&2
            return 1
        }
        _nvrx_dir="${_clone_root}/nvidia-resiliency-ext"

        # Run the steps as one chain so the first failure is what gets reported.
        {
            pip install diskcache \
                && git clone --depth 1 https://github.com/NVIDIA/nvidia-resiliency-ext "${_nvrx_dir}" \
                && pip install "${_nvrx_dir}"
        } || _rc=$?

        # The checkout is only needed while pip builds/installs from it.
        rm -rf -- "${_clone_root:?}"

        if [[ "$_rc" -ne 0 ]]; then
            echo "util_install_extra_dep: dependency install failed (exit ${_rc}); marker not created" >&2
            return "$_rc"
        fi
        touch "$_marker"
    else
        local _waited=0
        while [[ ! -f "$_marker" && $_waited -lt 600 ]]; do
            sleep 1
            _waited=$((_waited + 1))
        done
        # Distinguish "installer finished" from "gave up waiting".
        if [[ ! -f "$_marker" ]]; then
            echo "util_install_extra_dep: timed out after ${_waited}s waiting for ${_marker}" >&2
            return 1
        fi
    fi
}
5570

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# DFlash offline speculative decoding pipeline for Kimi-K2.5 (input model).
2+
# This YAML contains only task_0 (data synthesis) for the synth_support stage.
3+
# Tasks 1-3 are added by downstream stages.
4+
5+
job_name: Kimi-K2.5_DFlash_offline
6+
pipeline:
7+
allow_to_fail: false
8+
skip: false
9+
note:
10+
11+
global_vars:
12+
hf_model: /hf-local/moonshotai/Kimi-K2.5
13+
14+
# Step 1: Data synthesis via vLLM server
15+
# Args before "--" go to vLLM server; args after "--" go to tools/query.py.
16+
task_0:
17+
script: common/vllm/query.sh
18+
args:
19+
- --model <<global_vars.hf_model>>
20+
- --tensor-parallel-size 8
21+
- --port 8000
22+
- --host 0.0.0.0
23+
- --trust_remote_code
24+
- --enforce-eager
25+
- --gpu-memory-utilization 0.95
26+
- --max-model-len 4096
27+
- --
28+
- --data /hf-local/modelopt/Speculative-Decoding-Prompt-Samples
29+
- --save /scratchspace/data
30+
environment:
31+
- HF_LOCAL: /hf-local
32+
- VLLM_STARTUP_TIMEOUT: "1800"
33+
slurm_config:
34+
_factory_: "slurm_factory"
35+
nodes: 1
36+
ntasks_per_node: 1
37+
gpus_per_node: 8
38+
container: vllm/vllm-openai:latest

0 commit comments

Comments
 (0)