# Settings assigned before qa/test_template.sh is sourced at the end of this
# script (presumably consumed by that template - confirm against it).
# Python packages required by the benchmark.
pip_packages='numpy'
# Directory holding the benchmark script (presumably relative to the directory
# entered by the `pushd ../..` near the end of this script - confirm).
target_dir=./internal_tools
55
# One-time pre-step: install nsys (NVIDIA Nsight Systems) if not present.
#
# Registers the NVIDIA devtools apt repository for the running Ubuntu release
# and CPU architecture, then installs the `nsight-systems-cli` package.
# Returns immediately when an `nsys` command is already available.
do_once() {
    # Honour the "if not present" contract: skip all apt work when nsys exists.
    if command -v nsys > /dev/null 2>&1; then
        return 0
    fi

    apt update && apt install -y --no-install-recommends gnupg

    local release arch
    # /etc/lsb-release is sourced inside the command substitution so its
    # variables do not leak into this shell; "22.04" becomes "2204".
    release="$(source /etc/lsb-release && echo "$DISTRIB_RELEASE" | tr -d .)"
    arch="$(dpkg --print-architecture)"
    echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu${release}/${arch} /" \
        | tee /etc/apt/sources.list.d/nvidia-devtools.list

    # NOTE(review): the key URL is pinned to ubuntu1804/x86_64 while the repo
    # line above follows the running release/architecture - confirm this key
    # is the one that signs the devtools repository on every platform.
    apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
    apt update && apt install -y nsight-systems-cli
}
14+
# Log files written by the benchmark runs (one per configuration); all of them
# are removed by CLEAN_AND_EXIT.
# "legacy" vs "nvimgcodec": runs without / with --experimental_decoder.
LOG1="dali_legacy.log"
LOG2="dali_nvimgcodec.log"
# Same pair with DALI_USE_NEW_THREAD_POOL=1.
LOG1_TP="dali_legacy_new_tp.log"
LOG2_TP="dali_nvimgcodec_new_tp.log"
# Same pair using the ndd_rn50 pipeline.
LOG1_NDD="dali_ndd_legacy.log"
LOG2_NDD="dali_ndd_nvimgcodec.log"
# Experimental decoder with NVIMGCODEC_DEFAULT_NUM_CUDA_STREAMS=32.
LOG2_32STREAMS="dali_nvimgcodec_32streams.log"
LOG2_NDD_32STREAMS="dali_ndd_nvimgcodec_32streams.log"
23+
# Remove every benchmark log file and terminate the script.
# Arguments:
#   $1 - exit status to terminate with (defaults to 0 when omitted).
# Globals read: LOG1, LOG2, LOG1_TP, LOG2_TP, LOG1_NDD, LOG2_NDD,
#               LOG2_32STREAMS, LOG2_NDD_32STREAMS.
function CLEAN_AND_EXIT {
    # Quoted so a path containing whitespace or glob characters is removed as
    # a single file; `--` stops a name starting with '-' being read as an option.
    rm -rf -- \
        "${LOG1}" "${LOG2}" \
        "${LOG1_TP}" "${LOG2_TP}" \
        "${LOG1_NDD}" "${LOG2_NDD}" \
        "${LOG2_32STREAMS}" "${LOG2_NDD_32STREAMS}"
    exit "${1:-0}"
}
2135
# Run a single benchmark; if NSYS_REP is set, wrap with nsys and write that profile.
# Arguments:
#   $1   - log file that receives a copy of the benchmark's standard output.
#   $2.. - the benchmark command line to execute.
# Globals read: NSYS_REP (optional) - nsys output path; when empty or unset
#               the command runs without profiling.
# Outputs: the command's standard output, duplicated into the log via tee.
run_bench() {
    local log_file="$1"
    shift
    local -a cmd=("$@")
    # ${NSYS_REP:-} keeps this safe under `set -u`: callers explicitly
    # `unset NSYS_REP` before the non-profiled runs.
    if [[ -n "${NSYS_REP:-}" ]]; then
        cmd=(nsys profile -o "${NSYS_REP}" --stats=true "${cmd[@]}")
    fi
    "${cmd[@]}" | tee "${log_file}"
}
46+
# SPEC:DA-11356-002_v04 - run all benchmarks (optionally with nsys when NSYS_REP is set per run).
#
# Runs hw_decoder_bench.py in eight configurations (legacy / experimental
# decoder, new thread pool, ndd_rn50 pipeline, 32 nvimgcodec CUDA streams),
# each through run_bench so the output lands in the matching log file.
# Globals read: DALI_EXTRA_PATH, LOG1, LOG2, LOG1_TP, LOG2_TP, LOG1_NDD,
#               LOG2_NDD, LOG2_32STREAMS, LOG2_NDD_32STREAMS.
run_all_benchmarks() {
    local -a taskset_cmd bench
    local jobs hw_load

    # `uname -m` is used because `uname -p` is non-portable and may print
    # "unknown", which would silently select the GraceHopper settings.
    # taskset avoids inefficient data migration between cores we don't want to use.
    if [[ "$(uname -m)" == "x86_64" ]]; then
        # Hopper
        taskset_cmd=(taskset --cpu-list 0-127)
        jobs=70
        hw_load=0.12
    else
        # GraceHopper
        taskset_cmd=(taskset --cpu-list 0-71)
        jobs=72
        hw_load=0.11
    fi

    # Built as an array so DALI_EXTRA_PATH stays one argument even if it
    # contains whitespace.
    bench=(
        "${taskset_cmd[@]}" python hw_decoder_bench.py
        --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000
        -i "${DALI_EXTRA_PATH}/db/single/jpeg" -j "${jobs}" --hw_load "${hw_load}"
    )

    run_bench "${LOG1}" "${bench[@]}" -p rn50
    run_bench "${LOG2}" "${bench[@]}" -p rn50 --experimental_decoder
    DALI_USE_NEW_THREAD_POOL=1 run_bench "${LOG1_TP}" "${bench[@]}" -p rn50
    DALI_USE_NEW_THREAD_POOL=1 run_bench "${LOG2_TP}" "${bench[@]}" -p rn50 --experimental_decoder
    run_bench "${LOG1_NDD}" "${bench[@]}" -p ndd_rn50
    run_bench "${LOG2_NDD}" "${bench[@]}" -p ndd_rn50 --experimental_decoder
    NVIMGCODEC_DEFAULT_NUM_CUDA_STREAMS=32 run_bench "${LOG2_32STREAMS}" "${bench[@]}" -p rn50 --experimental_decoder
    NVIMGCODEC_DEFAULT_NUM_CUDA_STREAMS=32 run_bench "${LOG2_NDD_32STREAMS}" "${bench[@]}" -p ndd_rn50 --experimental_decoder
}
67+
# Run every decoder benchmark, compare the measured throughput against the
# per-platform minimums and against each other, and exit through
# CLEAN_AND_EXIT: status 0 when all enforced checks pass, status 1 otherwise.
# On failure the four main benchmarks are re-run under nsys and the resulting
# profiles are copied to ${topdir}/core_artifacts.
# Presumably invoked by qa/test_template.sh, which is sourced at the end of
# this script - confirm against the template.
# Globals read: DALI_EXTRA_PATH, topdir, LOG* log-file names.
# Globals written: MIN_PERF*, PERF_RESULT*, ARTIFACTS_DIR; also defines the
#                  helper functions extract_perf and perf_check.
test_body() {
    local arch
    # `uname -m` instead of the non-portable `uname -p`, which may print "unknown".
    arch="$(uname -m)"

    # Minimum acceptable throughput (frames/sec) per platform.
    if [[ "${arch}" == "x86_64" ]]; then
        # Hopper
        MIN_PERF=19000
        MIN_PERF2=18000 # TODO(janton): target is to be 19000 as well
        MIN_PERF_NDD=14000
        MIN_PERF2_NDD=14000 # TODO(janton): remove this second value.
    else
        # GraceHopper
        MIN_PERF=29000
        MIN_PERF2=29000 # TODO(janton): remove this second value.
        MIN_PERF_NDD=20000
        MIN_PERF2_NDD=20000 # TODO(janton): remove this second value.
    fi

    # First run: all benchmarks without nsys
    unset NSYS_REP
    run_all_benchmarks

    # Print the throughput number found in log file $1.
    # Regex: extract "Total Throughput: X frames/sec" -> X
    #   \K drops the literal prefix from the match; the lookahead requires
    #   " frames/sec" after the number without including it.
    extract_perf() {
        grep -oP 'Total Throughput: \K[0-9]+(\.[0-9]+)?(?= frames/sec)' "$1"
    }

    # Check the throughput recorded in a log file against a threshold.
    # Arguments:
    #   $1 - log file to extract the throughput value from.
    #   $2 - minimum threshold value to compare against.
    #   $3 - (optional) percent tolerance; the value may be as low as
    #        $2 * (1 - percent/100). Defaults to 0.
    # Outputs: "OK" if value >= threshold * (1 - tolerance), "FAIL" otherwise.
    #          "FAIL" is also printed when either number is missing or
    #          malformed (e.g. the benchmark crashed before reporting), so a
    #          run without a result can never pass.
    perf_check() {
        local value min_value percent
        local number_re='^[0-9]+(\.[0-9]+)?$'
        value="$(extract_perf "$1")"
        min_value="$2"
        percent="${3:-0}"
        if ! [[ "${value}" =~ ${number_re} && "${min_value}" =~ ${number_re} ]]; then
            echo "FAIL"
            return 0
        fi
        awk -v value="${value}" -v min_value="${min_value}" -v percent="${percent}" 'BEGIN {
            lower = min_value * (1 - percent / 100);
            if ((value + 0) >= lower) { print "OK" } else { print "FAIL" }
        }'
    }

    # Absolute thresholds.
    PERF_RESULT1=$(perf_check "${LOG1}" "$MIN_PERF")
    PERF_RESULT2=$(perf_check "${LOG2}" "$MIN_PERF2")
    PERF_RESULT1_NDD=$(perf_check "${LOG1_NDD}" "$MIN_PERF_NDD")
    PERF_RESULT2_NDD=$(perf_check "${LOG2_NDD}" "$MIN_PERF2_NDD")
    # Experimental decoder must be within 5% of the legacy decoder.
    PERF_RESULT3=$(perf_check "${LOG2}" "$(extract_perf "${LOG1}")" 5)
    PERF_RESULT3_NDD=$(perf_check "${LOG2_NDD}" "$(extract_perf "${LOG1_NDD}")" 5)
    # New thread pool must be within 2% of the matching default run.
    PERF_RESULT1_TP=$(perf_check "${LOG1_TP}" "$(extract_perf "${LOG1}")" 2)
    PERF_RESULT2_TP=$(perf_check "${LOG2_TP}" "$(extract_perf "${LOG2}")" 2)
    # 32-stream runs: absolute threshold and within 5% of the legacy decoder.
    PERF_RESULT2_32STREAMS=$(perf_check "${LOG2_32STREAMS}" "$MIN_PERF2")
    PERF_RESULT2_NDD_32STREAMS=$(perf_check "${LOG2_NDD_32STREAMS}" "$MIN_PERF2_NDD")
    PERF_RESULT3_32STREAMS=$(perf_check "${LOG2_32STREAMS}" "$(extract_perf "${LOG1}")" 5)
    PERF_RESULT3_NDD_32STREAMS=$(perf_check "${LOG2_NDD_32STREAMS}" "$(extract_perf "${LOG1_NDD}")" 5)

    echo "PERF_RESULT1=${PERF_RESULT1}"
    echo "PERF_RESULT2=${PERF_RESULT2}"
    echo "PERF_RESULT3=${PERF_RESULT3}"
    echo "PERF_RESULT1_TP=${PERF_RESULT1_TP}"
    echo "PERF_RESULT2_TP=${PERF_RESULT2_TP}"
    echo "PERF_RESULT1_NDD=${PERF_RESULT1_NDD}"
    echo "PERF_RESULT2_NDD=${PERF_RESULT2_NDD}"
    echo "PERF_RESULT3_NDD=${PERF_RESULT3_NDD}"
    echo "PERF_RESULT2_32STREAMS=${PERF_RESULT2_32STREAMS}"
    echo "PERF_RESULT2_NDD_32STREAMS=${PERF_RESULT2_NDD_32STREAMS}"
    echo "PERF_RESULT3_32STREAMS=${PERF_RESULT3_32STREAMS}"
    echo "PERF_RESULT3_NDD_32STREAMS=${PERF_RESULT3_NDD_32STREAMS}"

    # don't check experimental decoder performance with dynamic mode (PERF_RESULT2_NDD, PERF_RESULT3_NDD)
    if [[ "$PERF_RESULT1" == "OK" && "$PERF_RESULT2" == "OK" \
        && "$PERF_RESULT1_TP" == "OK" && "$PERF_RESULT2_TP" == "OK" \
        && "$PERF_RESULT3" == "OK" && "$PERF_RESULT1_NDD" == "OK" \
        && "$PERF_RESULT2_32STREAMS" == "OK" && "$PERF_RESULT3_32STREAMS" == "OK" \
        && "$PERF_RESULT2_NDD_32STREAMS" == "OK" && "$PERF_RESULT3_NDD_32STREAMS" == "OK" ]]; then
        CLEAN_AND_EXIT 0
    fi

    # On failure: re-run all benchmarks with nsys and save profiles to core_artifacts
    echo "Performance check failed; re-running all benchmarks with nsys profiling..."
    ARTIFACTS_DIR="${topdir}/core_artifacts"
    mkdir -p "${ARTIFACTS_DIR}"

    # Same per-platform command line as the first run, kept in an array so
    # DALI_EXTRA_PATH stays a single argument even if it contains whitespace.
    local -a bench
    if [[ "${arch}" == "x86_64" ]]; then
        bench=(taskset --cpu-list 0-127 python hw_decoder_bench.py
            --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000
            -i "${DALI_EXTRA_PATH}/db/single/jpeg" -j 70 --hw_load 0.12)
    else
        bench=(taskset --cpu-list 0-71 python hw_decoder_bench.py
            --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000
            -i "${DALI_EXTRA_PATH}/db/single/jpeg" -j 72 --hw_load 0.11)
    fi
    NSYS_REP="decoder_perf_legacy.nsys-rep" run_bench "${LOG1}" "${bench[@]}" -p rn50
    NSYS_REP="decoder_perf_nvimgcodec.nsys-rep" run_bench "${LOG2}" "${bench[@]}" -p rn50 --experimental_decoder
    NSYS_REP="decoder_perf_ndd_legacy.nsys-rep" run_bench "${LOG1_NDD}" "${bench[@]}" -p ndd_rn50
    NSYS_REP="decoder_perf_ndd_nvimgcodec.nsys-rep" run_bench "${LOG2_NDD}" "${bench[@]}" -p ndd_rn50 --experimental_decoder

    # Best effort on purpose: a missing profile must not mask the perf failure.
    cp -f *.nsys-rep "${ARTIFACTS_DIR}/" 2> /dev/null || true
    echo "nsys profiles saved to ${ARTIFACTS_DIR}"
    CLEAN_AND_EXIT 1
}
# Move two directories up, source the shared QA template from there, then
# return to the original directory.
# NOTE(review): the template presumably drives do_once and test_body defined
# in this script - confirm against qa/test_template.sh.
pushd ../..
source ./qa/test_template.sh
popd
0 commit comments