# Settings assigned before the shared test template is sourced at the bottom of
# this script (presumably consumed by that template — verify against
# qa/test_template.sh).
pip_packages='numpy'
target_dir=./internal_tools
55
# One-time pre-step: install nsys (NVIDIA Nsight Systems CLI) if not present.
#
# When nsys is already resolvable, prints its version and returns immediately.
# Otherwise registers the NVIDIA devtools apt repository (URL derived from
# /etc/lsb-release and `dpkg --print-architecture`), imports its signing key
# and installs the nsight-systems-cli package.
# NOTE(review): writes under /etc/apt and /usr/share/keyrings, so this
# presumably runs as root inside the CI container — confirm.
# Globals written: DISTRO, ARCH
do_once () {
  # The probe has to run in the foreground with both streams silenced so that
  # its exit status (and nothing else) selects the branch.
  if command -v nsys > /dev/null 2>&1; then
    echo "nsys already installed: $(nsys --version)"
    return
  fi
  # e.g. DISTRIB_RELEASE=22.04 -> 2204, matching the repository directory name.
  DISTRO=$(source /etc/lsb-release && echo "$DISTRIB_RELEASE" | tr -d .)
  ARCH=$(dpkg --print-architecture)
  apt update && apt install -y --no-install-recommends gnupg curl
  # The trailing " /" is the apt suite field of a flat repository; it is a
  # separate word, not part of the URL.
  echo "deb [signed-by=/usr/share/keyrings/nvidia-devtools.gpg] https://developer.download.nvidia.com/devtools/repos/ubuntu${DISTRO}/${ARCH} /" \
    | tee /etc/apt/sources.list.d/nvidia-devtools.list
  curl -fsSL "https://developer.download.nvidia.com/devtools/repos/ubuntu${DISTRO}/${ARCH}/nvidia.pub" \
    | gpg --dearmor -o /usr/share/keyrings/nvidia-devtools.gpg
  apt update && apt install -y nsight-systems-cli
}
21+
# Log file names, one per benchmark configuration:
#   LOG1 / LOG2         - rn50 pipeline, legacy / --experimental_decoder
#   LOG1_TP / LOG2_TP   - same two runs with DALI_USE_NEW_THREAD_POOL=1
#   LOG1_NDD / LOG2_NDD - ndd_rn50 pipeline, legacy / --experimental_decoder
LOG1="dali_legacy.log"
LOG2="dali_nvimgcodec.log"
LOG1_TP="dali_legacy_new_tp.log"
LOG2_TP="dali_nvimgcodec_new_tp.log"
LOG1_NDD="dali_ndd_legacy.log"
LOG2_NDD="dali_ndd_nvimgcodec.log"
28+
1229function CLEAN_AND_EXIT {
1330 rm -rf ${LOG1}
1431 rm -rf ${LOG2}
@@ -19,97 +36,106 @@ function CLEAN_AND_EXIT {
1936 exit $1
2037}
2138
# Run a single benchmark command and mirror its output into a log file.
# If NSYS_REP is set (non-empty), the command is wrapped with `nsys profile`.
# The profile filename is derived from the log filename
# (e.g. foo.log -> foo.nsys-rep).
# Arguments:
#   $1   - log file that receives a copy of the command's stdout
#   $2.. - the command to run, one word per argument
# Returns: the exit status of the `command | tee` pipeline.
run_bench () {
  local log_file="$1"
  shift
  # The ":-" default keeps this test valid under `set -u` even after the
  # caller has unset NSYS_REP.
  if [[ -n "${NSYS_REP:-}" ]]; then
    local profile_name="${log_file%.log}.nsys-rep"
    nsys profile -o "${profile_name}" --stats=true "$@" | tee "${log_file}"
  else
    "$@" | tee "${log_file}"
  fi
}
51+
# SPEC:DA-11356-002_v04 - run all benchmarks (optionally with nsys when
# NSYS_REP is set by the caller for this run).
#
# Runs hw_decoder_bench.py six times through run_bench, one log per run:
# rn50 and ndd_rn50 pipelines, each with and without --experimental_decoder,
# plus the two rn50 variants with DALI_USE_NEW_THREAD_POOL=1.
# Globals read: DALI_EXTRA_PATH, LOG1, LOG2, LOG1_TP, LOG2_TP, LOG1_NDD, LOG2_NDD
run_all_benchmarks () {
  local -a pin_cmd bench_args bench_cmd

  # Options shared by both platforms. The command line is kept in arrays so
  # that a DALI_EXTRA_PATH containing whitespace stays a single argument.
  bench_args=(
    --width_hint 6000 --height_hint 6000
    -b 408 -d 0 -g gpu -w 100 -t 100000
    -i "${DALI_EXTRA_PATH}/db/single/jpeg"
  )

  # use taskset to avoid inefficient data migration between cores we don't
  # want to use
  if [[ "$(uname -p)" == "x86_64" ]]; then
    # Hopper
    pin_cmd=(taskset --cpu-list 0-127)
    bench_args+=(-j 70 --hw_load 0.12)
  else
    # GraceHopper
    pin_cmd=(taskset --cpu-list 0-71)
    bench_args+=(-j 72 --hw_load 0.11)
  fi

  bench_cmd=("${pin_cmd[@]}" python hw_decoder_bench.py "${bench_args[@]}")

  run_bench "${LOG1}" "${bench_cmd[@]}" -p rn50
  run_bench "${LOG2}" "${bench_cmd[@]}" -p rn50 --experimental_decoder
  DALI_USE_NEW_THREAD_POOL=1 run_bench "${LOG1_TP}" "${bench_cmd[@]}" -p rn50
  DALI_USE_NEW_THREAD_POOL=1 run_bench "${LOG2_TP}" "${bench_cmd[@]}" -p rn50 --experimental_decoder
  run_bench "${LOG1_NDD}" "${bench_cmd[@]}" -p ndd_rn50
  run_bench "${LOG2_NDD}" "${bench_cmd[@]}" -p ndd_rn50 --experimental_decoder
}
70+
# Test entry point: run every benchmark, compare the measured throughput with
# the platform minimums, and on failure re-run everything under nsys so that
# profiles are available for investigation.
#
# Globals read:    LOG1, LOG2, LOG1_TP, LOG2_TP, LOG1_NDD, LOG2_NDD, topdir
#                  (topdir is presumably provided by the sourced test
#                  template — verify)
# Globals written: MIN_PERF*, PERF_RESULT*, ARTIFACTS_DIR
# Calls:           run_all_benchmarks, CLEAN_AND_EXIT
# Exit:            via CLEAN_AND_EXIT 0 when all gating checks are "OK",
#                  CLEAN_AND_EXIT 1 otherwise.
test_body () {
  if [[ "$(uname -p)" == "x86_64" ]]; then
    # Hopper
    MIN_PERF=19000
    MIN_PERF2=18000      # TODO(janton): target is to be 19000 as well
    MIN_PERF_NDD=14000
    MIN_PERF2_NDD=14000  # TODO(janton): remove this second value.
  else
    # GraceHopper
    MIN_PERF=29000
    MIN_PERF2=29000      # TODO(janton): remove this second value.
    MIN_PERF_NDD=20000
    MIN_PERF2_NDD=20000  # TODO(janton): remove this second value.
  fi

  # First run: all benchmarks without nsys
  unset NSYS_REP
  run_all_benchmarks

  # Print the number from a "Total Throughput: X frames/sec" line of log $1.
  # Regex explanation:
  #   Total Throughput: : matches the literal string "Total Throughput: ".
  #   \K: resets the start of the match, so the prefix is not printed.
  #   [0-9]+(\.[0-9]+)?: the number, with an optional decimal part.
  #   (?= frames/sec): requires " frames/sec" to follow, without printing it.
  extract_perf () {
    grep -oP 'Total Throughput: \K[0-9]+(\.[0-9]+)?(?= frames/sec)' "$1"
  }

  # Check whether the throughput found in a log reaches a threshold.
  # Args:
  #   $1: log file to extract the throughput value from.
  #   $2: minimum threshold value to compare against.
  #   $3: (optional) percent tolerance; the value may be as low as
  #       $2 * (1 - percent/100). Defaults to 0.
  # Prints "OK" if value >= $2 * (1 - percent/100), "FAIL" otherwise.
  # A missing measurement or a missing threshold is always "FAIL".
  perf_check () {
    local value min_value percent
    # No match is a legitimate outcome (e.g. the benchmark crashed); it must
    # not trip errexit here, the empty value is handled just below.
    value=$(extract_perf "$1") || true
    min_value=$2
    percent=${3:-0}
    # Without this guard an empty operand would make the lower bound 0 and the
    # comparison would report "OK" for a run that produced no result.
    if [[ -z "${value}" || -z "${min_value}" ]]; then
      echo "FAIL"
      return
    fi
    awk -v v="${value}" -v m="${min_value}" -v p="${percent}" 'BEGIN {
      lower = m * (1 - p / 100);
      if (v + 0 >= lower) {print "OK"} else {print "FAIL"}
    }'
  }

  PERF_RESULT1=$(perf_check "${LOG1}" "$MIN_PERF")
  PERF_RESULT2=$(perf_check "${LOG2}" "$MIN_PERF2")
  PERF_RESULT1_NDD=$(perf_check "${LOG1_NDD}" "$MIN_PERF_NDD")
  PERF_RESULT2_NDD=$(perf_check "${LOG2_NDD}" "$MIN_PERF2_NDD")
  # Experimental decoder must stay within 5% of the legacy decoder.
  PERF_RESULT3=$(perf_check "${LOG2}" "$(extract_perf "${LOG1}")" 5)
  PERF_RESULT3_NDD=$(perf_check "${LOG2_NDD}" "$(extract_perf "${LOG1_NDD}")" 5)
  # New thread pool runs are compared with a 2% tolerance against the
  # corresponding default runs.
  PERF_RESULT1_TP=$(perf_check "${LOG1_TP}" "$(extract_perf "${LOG1}")" 2)
  PERF_RESULT2_TP=$(perf_check "${LOG2_TP}" "$(extract_perf "${LOG2}")" 2)

  echo "PERF_RESULT1=${PERF_RESULT1}"
  echo "PERF_RESULT2=${PERF_RESULT2}"
  echo "PERF_RESULT3=${PERF_RESULT3}"
  echo "PERF_RESULT1_TP=${PERF_RESULT1_TP} (informational)"
  echo "PERF_RESULT2_TP=${PERF_RESULT2_TP} (informational)"
  echo "PERF_RESULT1_NDD=${PERF_RESULT1_NDD}"
  echo "PERF_RESULT2_NDD=${PERF_RESULT2_NDD}"
  echo "PERF_RESULT3_NDD=${PERF_RESULT3_NDD}"

  # don't check experimental decoder performance with dynamic mode
  # (PERF_RESULT2_NDD, PERF_RESULT3_NDD)
  # PERF_RESULT1_TP and PERF_RESULT2_TP are informational only (new thread
  # pool is experimental)
  if [[ "$PERF_RESULT1" == "OK" && "$PERF_RESULT2" == "OK" && "$PERF_RESULT3" == "OK" && "$PERF_RESULT1_NDD" == "OK" ]]; then
    CLEAN_AND_EXIT 0
  fi

  # On failure: re-run all benchmarks with nsys and save profiles to
  # core_artifacts
  echo "Performance check failed; re-running all benchmarks with nsys profiling..."
  ARTIFACTS_DIR="${topdir}/core_artifacts"
  mkdir -p "${ARTIFACTS_DIR}"

  # Profiling is diagnostic only: a failure here must not prevent the cleanup
  # and the failing exit below.
  NSYS_REP="enabled" run_all_benchmarks \
    || echo "warning: nsys profiling re-run reported an error" >&2

  # Best effort: there may be no profiles at all (e.g. nsys unavailable), so
  # only claim success when the copy actually worked.
  if cp -f -- *.nsys-rep "${ARTIFACTS_DIR}/" 2> /dev/null; then
    echo "nsys profiles saved to ${ARTIFACTS_DIR}"
  else
    echo "warning: no nsys profiles could be copied to ${ARTIFACTS_DIR}" >&2
  fi
  CLEAN_AND_EXIT 1
}
# Source the shared test template from two directories up, then return to the
# original directory. The functions defined in this script are presumably
# invoked by that template — verify against qa/test_template.sh.
pushd ../..
source ./qa/test_template.sh
popd
0 commit comments