-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathserver_launch.sh
More file actions
70 lines (66 loc) · 2.56 KB
/
Copy pathserver_launch.sh
File metadata and controls
70 lines (66 loc) · 2.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env bash
# Launch an sglang server with vortex sparse attention.
#
# Usage: ./server_launch.sh <MODEL_NAME> <TP_SIZE>
# Example: ./server_launch.sh Qwen/Qwen3-4B 1
#
# The vortex_* hyper-parameters are no longer individual CLI flags: sglang's
# ServerArgs now carries a single aggregated `vortex` field, exposed on the CLI
# as `--vortex-config '<json>'` (see vortex_torch/engine/sgl/config.py and
# third_party/.../server_args.py). Passing the old per-knob `--vortex-*` flags
# fails argparse. We therefore write the knobs to a JSON file and feed it
# through `--vortex-config`. Keys are the VortexConfig field names (the
# `vortex_` prefix is stripped). Providing a non-null vortex config implicitly
# enables sparsity.
# Fail fast: abort on command errors, unset variables, and pipeline failures.
set -euo pipefail

# Both positional arguments are mandatory. Without this check, missing values
# would be forwarded to the Python launcher below as empty strings.
if (( $# < 2 )) || [[ -z "$1" || -z "$2" ]]; then
  printf 'Usage: %s <MODEL_NAME> <TP_SIZE>\n' "${0##*/}" >&2
  printf 'Example: %s Qwen/Qwen3-4B 1\n' "${0##*/}" >&2
  exit 2
fi

# Placeholder value; presumably a client library requires the variable to be
# set -- TODO confirm.
export OPENAI_API_KEY="None"

MODEL_NAME="$1"   # model identifier/path handed to --model-path
TP_SIZE="$2"      # value handed to --tp-size

# Scratch file holding the vortex knobs as JSON; removed again on exit.
# The X's are kept at the very end of the template: GNU mktemp accepts a
# suffix after them, but BSD/macOS mktemp only substitutes trailing X's.
VORTEX_CONFIG_FILE="$(mktemp "${TMPDIR:-/tmp}/vortex_config.XXXXXX")" || {
  printf 'error: failed to create a temporary vortex config file\n' >&2
  exit 1
}
trap 'rm -f -- "$VORTEX_CONFIG_FILE"' EXIT

# The heredoc delimiter is quoted, so the JSON is written literally (no shell
# expansion).
# NOTE(review): "~" in compilation_cache_dir is therefore NOT expanded by the
# shell; this assumes the consumer expands it -- confirm.
cat > "$VORTEX_CONFIG_FILE" <<'JSON'
{
  "impl_backend": "triton",
  "use_tensor_core": true,
  "attention_backend": "trtllm",
  "layers_skip": [],
  "block_reserved_eos": 1,
  "block_reserved_bos": 2,
  "topk_val": 61,
  "block_size": 32,
  "workload_chunk_size": 64,
  "module_name": "block_sparse_attention",
  "max_seq_lens": 32768,
  "max_topk_val": 256,
  "dtype": "bfloat16",
  "compilation_cache_dir": "~/.vortex_compilation_cache"
}
JSON

# NOTE: we cannot use `python -m sglang.launch_server` directly. That entrypoint
# builds `ServerArgs` in the parent process before anything imports vortex_torch,
# so the `--vortex-config` JSON string is never folded into a VortexConfig (the
# `ServerArgs.__init__` adapter that does this is installed by `import
# vortex_torch`). The raw string then gets pickled to the spawned scheduler
# worker, where `server_args.vortex_block_size` -> `getattr(str, "block_size")`
# raises AttributeError. Importing vortex_torch FIRST, in this parent process,
# installs the adapter so the conversion happens before ServerArgs is pickled.
#
# With `python -c`, everything after the program text lands in sys.argv[1:],
# which is what gets handed to prepare_server_args().
#
# NOTE(review): the vortex config above requests attention_backend "trtllm"
# while sglang itself is started with --attention-backend "flashinfer" --
# confirm that this split is intentional.
python -c '
import os, sys
import vortex_torch  # installs the ServerArgs adapter + backend integration
from sglang.launch_server import run_server
from sglang.srt.server_args import prepare_server_args
from sglang.srt.utils import kill_process_tree

server_args = prepare_server_args(sys.argv[1:])
try:
    run_server(server_args)
finally:
    # Reap child processes of this launcher, but not the launcher itself.
    kill_process_tree(os.getpid(), include_parent=False)
' \
  --model-path "$MODEL_NAME" \
  --page-size 32 \
  --attention-backend "flashinfer" \
  --vortex-config "$(<"$VORTEX_CONFIG_FILE")" \
  --context-length 32768 \
  --mem-fraction-static 0.9 \
  --tp-size "$TP_SIZE" \
  --port 30000 \
  --host 127.0.0.1