-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathrun_benchmark_breakdown.sh
More file actions
executable file
·78 lines (68 loc) · 2.25 KB
/
run_benchmark_breakdown.sh
File metadata and controls
executable file
·78 lines (68 loc) · 2.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Usage examples. The 6th argument (STEP_ID) selects the sweep:
# 0 runs decode, any other value (or omitting it) runs prefill.
# Santacoder prefill.
# ./scripts/run_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 5 1
# Santacoder decode (fewer data points because slower).
# ./scripts/run_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 11 0
MODEL_NAME=${1:-"santacoder"}
MODEL_PATH=${2:-"bigcode/gpt_bigcode-santacoder"}
BATCH_SIZE=${3:-32}
MAX_NEW_TOKENS=${4:-2040}
# Prime number to see key length padding effect.
TOKEN_STEP=${5:-5}
STEP_ID=${6:-""}
FILE_PREFIX=${7:-""}
CYCLES=${8:-10}
SAVE_DIR=data/benchmarks/v2
#BATCH_SIZES="1 2 4 8 16 24 32 48 64 96 128 160 224 256"
RUN="python3 src/main.py --max_log_outputs=0 --dtype=float16 --device=cuda --custom_generate --breakdown_latency --ignore_oom"
RUNTIME=("" "pre_allocate_kv_cache=True" "pre_allocate_kv_cache=True inference_runner=3")
RUNTIME_NAMES=("base" "pre_allocate" "graph")
ATTN=( \
"attention_implementation=0" \
"attention_implementation=1" \
"attention_implementation=1 --pad_generated_tokens=0.5" \
"attention_implementation=2" \
"attention_implementation=0 fused_softmax=False" \
"attention_implementation=0 fused_softmax=True" \
"attention_implementation=3" \
"attention_implementation=4" \
"attention_implementation=5" \
)
ATTN_NAME=( \
"default" \
"flash" \
"flash_unpad_50" \
"torch" \
"no_jit" \
"jit" \
"torchflash" \
"torchmem" \
"torchcpp" \
)
STEP=("--no_prefill" "--no_cache")
STEP_NAME=("decode" "prefill")
COMMON="--pretrained_model=$MODEL_PATH --tokenizer=$MODEL_PATH --cycles=$CYCLES --max_input_length=1 --max_new_tokens=$MAX_NEW_TOKENS --key_length_step=$TOKEN_STEP --batch_size=$BATCH_SIZE predict_last_token=True"
run () { # run(step, runtime, attn)
FILE_NAME="$SAVE_DIR"/"$MODEL_NAME"_bs_"$BATCH_SIZE"_tok_"$MAX_NEW_TOKENS"_step_"$TOKEN_STEP"_"${STEP_NAME[$1]}"/"$FILE_PREFIX""${RUNTIME_NAMES[$2]}"_"${ATTN_NAME[$3]}".json
if [ -f "$FILE_NAME" ];
then
echo "Skipping existing $FILE_NAME"
else
CMD="$RUN $COMMON ${RUNTIME[$2]} ${ATTN[$3]} ${STEP[$1]} --save=$FILE_NAME"
echo "$CMD"
$CMD
fi
}
if [ "${STEP_ID}" -eq "0" ]
then
# Decode (default attn only)
for runtime in {0..2}
do
run 0 $runtime 0
done
else
# Prefill (all runtimes are the same)
for attn in {0..2}
do
run 1 0 $attn
done
fi