Skip to content

Commit abbda41

Browse files
author
pensieve-intern
committed
[OMNIML-4740] synth_support — pensieve-intern agent draft
1 parent e2d29c8 commit abbda41

2 files changed

Lines changed: 60 additions & 2 deletions

File tree

tools/launcher/common/service_utils.sh

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
# Ranks as exported by an Open MPI launcher; empty when that launcher is not in use.
native_mpi_rank=$OMPI_COMM_WORLD_RANK
1919
native_mpi_local_rank=$OMPI_COMM_WORLD_LOCAL_RANK
2020
# Works with Slurm launching with `--mpi=pmix`
21-
mpi_rank=${PMIX_RANK:-$native_mpi_rank}
22-
mpi_local_rank=${PMIX_LOCAL_RANK:-$native_mpi_local_rank}
# Resolution order for both ranks: the PMIx variable, then the Open MPI
# variable, then the Slurm variable (SLURM_PROCID / SLURM_LOCALID), and
# finally 0 when none of them is set or all are empty.
21+
mpi_rank=${PMIX_RANK:-${native_mpi_rank:-${SLURM_PROCID:-0}}}
22+
mpi_local_rank=${PMIX_LOCAL_RANK:-${native_mpi_local_rank:-${SLURM_LOCALID:-0}}}
2323

2424
# Both start at 0. NOTE(review): presumably failure bookkeeping consumed by
# report_result; their use is outside this view — confirm.
FAIL=0
2525
FAIL_EXIT=0
@@ -48,8 +48,23 @@ function report_result {
4848
}
4949

5050
#######################################
# Install the extra Python dependencies (diskcache and
# nvidia-resiliency-ext) once, coordinating ranks through a marker file.
#
# Local rank 0 performs the installation and creates the marker file only
# when every step succeeded. All other ranks poll for the marker, one
# second at a time, until it appears or the timeout elapses.
#
# Globals:
#   mpi_local_rank         (read) local rank; rank 0 does the install.
#   NMM_EXTRA_DEP_MARKER   (read, optional) marker file path;
#                          defaults to /tmp/.nmm_extra_dep_installed.
#   NMM_EXTRA_DEP_TIMEOUT  (read, optional) seconds a non-zero rank waits;
#                          defaults to 600.
# Returns:
#   0 if the marker already exists, the install succeeded, or the marker
#   appeared while waiting; non-zero if the install failed or the wait
#   timed out.
#######################################
function util_install_extra_dep {
    local _marker="${NMM_EXTRA_DEP_MARKER:-/tmp/.nmm_extra_dep_installed}"
    local _timeout="${NMM_EXTRA_DEP_TIMEOUT:-600}"

    # Fast path: a previous call already completed the installation.
    if [[ -f "$_marker" ]]; then
        return 0
    fi

    if [[ "$mpi_local_rank" -eq 0 ]]; then
        local _tmp_dir _nvrx_dir _rc=0

        if ! _tmp_dir="$(mktemp -d)"; then
            echo "util_install_extra_dep: mktemp failed" >&2
            return 1
        fi
        _nvrx_dir="${_tmp_dir}/nvidia-resiliency-ext"

        # The two installs are independent: attempt both, remember any failure.
        if ! pip install diskcache; then
            _rc=1
        fi
        if ! { git clone --depth 1 https://github.com/NVIDIA/nvidia-resiliency-ext "${_nvrx_dir}" \
                && pip install "${_nvrx_dir}"; }; then
            _rc=1
        fi

        # The install above is non-editable, so the checkout is no longer
        # needed; remove it instead of leaking a temp directory per call.
        rm -rf -- "${_tmp_dir:?}"

        if [[ "$_rc" -ne 0 ]]; then
            # Do NOT create the marker: a failed install must not be
            # recorded as done for later calls or for waiting ranks.
            echo "util_install_extra_dep: dependency installation failed" >&2
            return 1
        fi
        touch "$_marker"
    else
        local _waited=0
        while [[ ! -f "$_marker" && "$_waited" -lt "$_timeout" ]]; do
            sleep 1
            _waited=$((_waited + 1))
        done
        if [[ ! -f "$_marker" ]]; then
            echo "util_install_extra_dep: timed out after ${_timeout}s waiting for ${_marker}" >&2
            return 1
        fi
    fi
}
5570

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# DFlash offline synthetic data generation pipeline for Kimi-K2.5.
2+
#
3+
# 1-step pipeline (task_0 only):
4+
# task_0: Data synthesis — query vLLM server to generate prompt samples
5+
#
6+
# Usage:
7+
# uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_offline_dflash.yaml --yes
8+
# uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_offline_dflash.yaml --yes
9+
10+
job_name: Kimi-K2.5_DFlash_offline
11+
pipeline:
12+
allow_to_fail: false
13+
skip: false
14+
note:
15+
16+
global_vars:
17+
# NOTE(review): this path points at Kimi-K2.6, while job_name, the header comment and the example directory all say Kimi-K2.5 — confirm which checkpoint is intended.
hf_model: /hf-local/moonshotai/Kimi-K2.6
18+
19+
# Step 1: Data synthesis via vLLM server
20+
# Args before "--" go to vllm-serve; args after "--" go to tools/query.py.
21+
task_0:
22+
script: common/vllm/query.sh
23+
args:
24+
- --model <<global_vars.hf_model>>
25+
- --tensor-parallel-size 8
26+
- --port 8000
27+
- --host 0.0.0.0
28+
- --trust-remote-code
29+
- --enforce-eager
30+
- --gpu-memory-utilization 0.95
31+
- --max-model-len 4096
32+
- --
33+
- --data /nemo_run/code/modules/Model-Optimizer/examples/dataset/synthetic_conversations_1k.jsonl
34+
- --save /scratchspace/data
35+
environment:
36+
- HF_LOCAL: /hf-local
37+
- VLLM_STARTUP_TIMEOUT: "1800"
38+
slurm_config:
39+
_factory_: "slurm_factory"
40+
nodes: 1
41+
ntasks_per_node: 1
42+
gpus_per_node: 8
43+
container: vllm/vllm-openai:latest

0 commit comments

Comments
 (0)