Skip to content

Commit 32768d6

Browse files
authored
Adapt DALI to nvImageCodec 0.8.0 (#6293)
* Adapt DALI to nvImageCodec 0.8.0
  - Bump required nvImageCodec version range from 0.7.x to 0.8.x
  - Update nvimgcodec download URL and hash to 0.8.0.22
  - Update nvimgcodecCodeStreamCreateFromHostMem call to match the new 0.8.0 API signature (added nullable parameter)
  - Rework qa/TL1_decoder_perf/test.sh to collect nsys profiles on failure for easier debugging
* Relax TL1_decoder_perf: make thread-pool checks informational, drop 32-stream benchmarks

  PERF_RESULT1_TP and PERF_RESULT2_TP are now informational only since the new thread pool is still experimental. The 32-stream benchmark variants are removed as they are no longer needed.
* Update decoder performance test script
1 parent 491b193 commit 32768d6

4 files changed

Lines changed: 114 additions & 88 deletions

File tree

cmake/Dependencies.common.cmake

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -297,8 +297,8 @@ endif()
297297
##################################################################
298298
set(DALI_INSTALL_REQUIRES_NVIMGCODEC "")
299299
if(BUILD_NVIMAGECODEC)
300-
set(NVIMGCODEC_MIN_VERSION "0.7.0")
301-
set(NVIMGCODEC_MAX_VERSION "0.8.0")
300+
set(NVIMGCODEC_MIN_VERSION "0.8.0")
301+
set(NVIMGCODEC_MAX_VERSION "0.9.0")
302302
message(STATUS "nvImageCodec - requires version >=${NVIMGCODEC_MIN_VERSION}, <${NVIMGCODEC_MAX_VERSION}")
303303
if (WITH_DYNAMIC_NVIMGCODEC)
304304
message(STATUS "nvImageCodec - dynamic load")
@@ -315,8 +315,8 @@ if(BUILD_NVIMAGECODEC)
315315
include(FetchContent)
316316
FetchContent_Declare(
317317
nvimgcodec_headers
318-
URL https://developer.download.nvidia.com/compute/nvimgcodec/redist/nvimgcodec/linux-x86_64/nvimgcodec-linux-x86_64-0.7.0.11-archive.tar.xz
319-
URL_HASH SHA512=0777af0a41500de7aaeffb6966b3da20271f807c6af106307b9759854c082d5b6f850c0455b011b8978fc5954514bb46dbd5da0904d471309adf9fdfbaf7dd98
318+
URL https://developer.download.nvidia.com/compute/nvimgcodec/redist/nvimgcodec/linux-x86_64/nvimgcodec-linux-x86_64-0.8.0.22-archive.tar.xz
319+
URL_HASH SHA512=2a400f75c619a10c3dbcd298a83ef3307f6e08453b2cfb5040f6b22c64c7be0ac4552a2a80ed057afe7657cf0bb8cc2d54cdccf8bc50ffdf34cfd05b45082978
320320
)
321321
FetchContent_Populate(nvimgcodec_headers)
322322
set(nvimgcodec_INCLUDE_DIR "${nvimgcodec_headers_SOURCE_DIR}/${CUDA_VERSION_MAJOR}/include")
@@ -357,7 +357,7 @@ if(BUILD_NVIMAGECODEC)
357357
ExternalProject_Add(
358358
nvImageCodec
359359
GIT_REPOSITORY https://github.com/NVIDIA/nvImageCodec.git
360-
GIT_TAG v0.7.0
360+
GIT_TAG v0.8.0
361361
GIT_SUBMODULES "external/pybind11"
362362
"external/NVTX"
363363
"external/googletest"

conda/third_party/dali_nvimagecodec/recipe/meta.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,15 +13,15 @@
1313
# limitations under the License.
1414

1515

16-
{% set build_version = "0.7.0" %}
16+
{% set build_version = "0.8.0" %}
1717

1818
package:
1919
name: nvidia-nvimagecodec-cuda{{ environ.get('CUDA_VERSION', '') | replace(".","") }}
2020
version: {{ build_version }}
2121

2222
source:
2323
git_url: https://github.com/NVIDIA/nvImageCodec.git
24-
git_rev: v0.7.0
24+
git_rev: v0.8.0
2525

2626
build:
2727
number: 0

dali/operators/imgcodec/util/nvimagecodec_types.cc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@ NvImageCodecCodeStream NvImageCodecCodeStream::FromHostMem(nvimgcodecInstance_t
4444
const void *data, size_t length) {
4545
NvImageCodecCodeStream ret;
4646
CHECK_NVIMGCODEC(nvimgcodecCodeStreamCreateFromHostMem(
47-
instance, &ret.handle_, static_cast<const unsigned char*>(data), length));
47+
instance, &ret.handle_, static_cast<const unsigned char*>(data), length, nullptr));
4848
return ret;
4949
}
5050

qa/TL1_decoder_perf/test.sh

Lines changed: 106 additions & 80 deletions
Original file line number | Diff line number | Diff line change
@@ -3,12 +3,29 @@
33
pip_packages='numpy'
44
target_dir=./internal_tools
55

6+
# One-time pre-step: install nsys (NVIDIA Nsight Systems) if not present
7+
do_once() {
8+
if command -v nsys &>/dev/null; then
9+
echo "nsys already installed: $(nsys --version)"
10+
return
11+
fi
12+
DISTRO=$(source /etc/lsb-release && echo "$DISTRIB_RELEASE" | tr -d .)
13+
ARCH=$(dpkg --print-architecture)
14+
apt update && apt install -y --no-install-recommends gnupg curl
15+
echo "deb [signed-by=/usr/share/keyrings/nvidia-devtools.gpg] https://developer.download.nvidia.com/devtools/repos/ubuntu${DISTRO}/${ARCH} /" \
16+
| tee /etc/apt/sources.list.d/nvidia-devtools.list
17+
curl -fsSL "https://developer.download.nvidia.com/devtools/repos/ubuntu${DISTRO}/${ARCH}/nvidia.pub" \
18+
| gpg --dearmor -o /usr/share/keyrings/nvidia-devtools.gpg
19+
apt update && apt install -y nsight-systems-cli
20+
}
21+
622
LOG1="dali_legacy.log"
723
LOG2="dali_nvimgcodec.log"
824
LOG1_TP="dali_legacy_new_tp.log"
925
LOG2_TP="dali_nvimgcodec_new_tp.log"
1026
LOG1_NDD="dali_ndd_legacy.log"
1127
LOG2_NDD="dali_ndd_nvimgcodec.log"
28+
1229
function CLEAN_AND_EXIT {
1330
rm -rf ${LOG1}
1431
rm -rf ${LOG2}
@@ -19,97 +36,106 @@ function CLEAN_AND_EXIT {
1936
exit $1
2037
}
2138

39+
# Run a single benchmark; if NSYS_REP is set, wrap with nsys profiling.
40+
# The profile filename is derived from the log filename (e.g. foo.log -> foo.nsys-rep).
41+
run_bench() {
42+
local log_file="$1"
43+
shift
44+
if [ -n "${NSYS_REP}" ]; then
45+
local profile_name="${log_file%.log}.nsys-rep"
46+
nsys profile -o "${profile_name}" --stats=true "$@" | tee "${log_file}"
47+
else
48+
"$@" | tee "${log_file}"
49+
fi
50+
}
51+
52+
# SPEC:DA-11356-002_v04 - run all benchmarks (optionally with nsys when NSYS_REP is set per run).
53+
run_all_benchmarks() {
54+
if [ "$(uname -p)" == "x86_64" ]; then
55+
# Hopper
56+
TASKSET="taskset --cpu-list 0-127"
57+
BENCH_ARGS="--width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -j 70 --hw_load 0.12"
58+
else
59+
# GraceHopper
60+
TASKSET="taskset --cpu-list 0-71"
61+
BENCH_ARGS="--width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -j 72 --hw_load 0.11"
62+
fi
63+
run_bench "${LOG1}" ${TASKSET} python hw_decoder_bench.py ${BENCH_ARGS} -p rn50
64+
run_bench "${LOG2}" ${TASKSET} python hw_decoder_bench.py ${BENCH_ARGS} -p rn50 --experimental_decoder
65+
DALI_USE_NEW_THREAD_POOL=1 run_bench "${LOG1_TP}" ${TASKSET} python hw_decoder_bench.py ${BENCH_ARGS} -p rn50
66+
DALI_USE_NEW_THREAD_POOL=1 run_bench "${LOG2_TP}" ${TASKSET} python hw_decoder_bench.py ${BENCH_ARGS} -p rn50 --experimental_decoder
67+
run_bench "${LOG1_NDD}" ${TASKSET} python hw_decoder_bench.py ${BENCH_ARGS} -p ndd_rn50
68+
run_bench "${LOG2_NDD}" ${TASKSET} python hw_decoder_bench.py ${BENCH_ARGS} -p ndd_rn50 --experimental_decoder
69+
}
70+
2271
test_body() {
23-
# SPEC:DA-11356-002_v04
24-
if [ "$(uname -p)" == "x86_64" ]; then
25-
# Hopper
26-
MIN_PERF=19000;
27-
MIN_PERF2=18000; # TODO(janton): target is to be 19000 as well
28-
MIN_PERF_NDD=14000;
29-
MIN_PERF2_NDD=14000; # TODO(janton): remove this second value.
30-
# use taskset to avoid inefficient data migration between cores we don't want to use
31-
taskset --cpu-list 0-127 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 70 --hw_load 0.12 | tee ${LOG1}
32-
taskset --cpu-list 0-127 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 70 --hw_load 0.12 --experimental_decoder | tee ${LOG2}
33-
DALI_USE_NEW_THREAD_POOL=1 taskset --cpu-list 0-127 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 70 --hw_load 0.12 | tee ${LOG1_TP}
34-
DALI_USE_NEW_THREAD_POOL=1 taskset --cpu-list 0-127 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 70 --hw_load 0.12 --experimental_decoder | tee ${LOG2_TP}
35-
taskset --cpu-list 0-127 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p ndd_rn50 -j 70 --hw_load 0.12 | tee ${LOG1_NDD}
36-
taskset --cpu-list 0-127 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p ndd_rn50 -j 70 --hw_load 0.12 --experimental_decoder | tee ${LOG2_NDD}
72+
if [ "$(uname -p)" == "x86_64" ]; then
73+
MIN_PERF=19000
74+
MIN_PERF2=18000 # TODO(janton): target is to be 19000 as well
75+
MIN_PERF_NDD=14000
76+
MIN_PERF2_NDD=14000 # TODO(janton): remove this second value.
77+
else
78+
MIN_PERF=29000
79+
MIN_PERF2=29000 # TODO(janton): remove this second value.
80+
MIN_PERF_NDD=20000
81+
MIN_PERF2_NDD=20000 # TODO(janton): remove this second value.
82+
fi
3783

38-
else
39-
# GraceHopper
40-
MIN_PERF=29000;
41-
MIN_PERF2=29000; # TODO(janton): remove this second value.
42-
MIN_PERF_NDD=20000;
43-
MIN_PERF2_NDD=20000; # TODO(janton): remove this second value.
44-
# use taskset to avoid inefficient data migration between cores we don't want to use
45-
taskset --cpu-list 0-71 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 72 --hw_load 0.11 | tee ${LOG1}
46-
taskset --cpu-list 0-71 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 72 --hw_load 0.11 --experimental_decoder | tee ${LOG2}
47-
DALI_USE_NEW_THREAD_POOL=1 taskset --cpu-list 0-71 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 72 --hw_load 0.11 | tee ${LOG1_TP}
48-
DALI_USE_NEW_THREAD_POOL=1 taskset --cpu-list 0-71 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 72 --hw_load 0.11 --experimental_decoder | tee ${LOG2_TP}
49-
taskset --cpu-list 0-71 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p ndd_rn50 -j 72 --hw_load 0.11 | tee ${LOG1_NDD}
50-
taskset --cpu-list 0-71 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p ndd_rn50 -j 72 --hw_load 0.11 --experimental_decoder | tee ${LOG2_NDD}
51-
fi
84+
# First run: all benchmarks without nsys
85+
unset NSYS_REP
86+
run_all_benchmarks
5287

53-
# Regex Explanation:
54-
# Total Throughput: : Matches the literal string "Total Throughput: ".
55-
# \K: Resets the start of the match, so anything before \K is not included in the output.
56-
# [0-9]+(\.[0-9]+)?: Matches the number, with an optional decimal part.
57-
# (?= frames/sec): ensures " frames/sec" follows the number, but doesn't include it.
58-
extract_perf() {
59-
log_file="$1"
60-
grep -oP 'Total Throughput: \K[0-9]+(\.[0-9]+)?(?= frames/sec)' "${log_file}"
61-
}
88+
# Regex: extract "Total Throughput: X frames/sec" -> X
89+
extract_perf() {
90+
grep -oP 'Total Throughput: \K[0-9]+(\.[0-9]+)?(?= frames/sec)' "$1"
91+
}
6292

93+
perf_check() {
94+
local value=$(extract_perf "$1")
95+
local min_value=$2
96+
local percent=${3:-0}
97+
local tolerance=$(awk -v p="$percent" 'BEGIN{print p/100}')
98+
echo "$value $min_value" | awk -v tol="$tolerance" '{
99+
lower = $2 * (1 - tol);
100+
if ($1 >= lower) {print "OK"} else {print "FAIL"}
101+
}'
102+
}
63103

64-
perf_check() {
65-
# Checks if the extracted performance value from the specified log file
66-
# is within a given percentage tolerance of a minimum threshold.
104+
PERF_RESULT1=$(perf_check "${LOG1}" "$MIN_PERF")
105+
PERF_RESULT2=$(perf_check "${LOG2}" "$MIN_PERF2")
106+
PERF_RESULT1_NDD=$(perf_check "${LOG1_NDD}" "$MIN_PERF_NDD")
107+
PERF_RESULT2_NDD=$(perf_check "${LOG2_NDD}" "$MIN_PERF2_NDD")
108+
PERF_RESULT3=$(perf_check "${LOG2}" "$(extract_perf "${LOG1}")" 5)
109+
PERF_RESULT3_NDD=$(perf_check "${LOG2_NDD}" "$(extract_perf "${LOG1_NDD}")" 5)
110+
PERF_RESULT1_TP=$(perf_check "${LOG1_TP}" "$(extract_perf "${LOG1}")" 2)
111+
PERF_RESULT2_TP=$(perf_check "${LOG2_TP}" "$(extract_perf "${LOG2}")" 2)
67112

68-
# Args:
69-
# $1: The log file to extract the throughput value from.
70-
# $2: The minimum threshold value to compare against.
71-
# $3: (Optional) Percent tolerance. If specified, allows value to be
72-
# within $2 * (1 - percent/100). Defaults to 0.
113+
echo "PERF_RESULT1=${PERF_RESULT1}"
114+
echo "PERF_RESULT2=${PERF_RESULT2}"
115+
echo "PERF_RESULT3=${PERF_RESULT3}"
116+
echo "PERF_RESULT1_TP=${PERF_RESULT1_TP} (informational)"
117+
echo "PERF_RESULT2_TP=${PERF_RESULT2_TP} (informational)"
118+
echo "PERF_RESULT1_NDD=${PERF_RESULT1_NDD}"
119+
echo "PERF_RESULT2_NDD=${PERF_RESULT2_NDD}"
120+
echo "PERF_RESULT3_NDD=${PERF_RESULT3_NDD}"
73121

74-
# Returns:
75-
# Prints "OK" if value >= min_value*(1-tolerance), "FAIL" otherwise.
122+
# don't check experimental decoder performance with dynamic mode (PERF_RESULT2_NDD, PERF_RESULT3_NDD)
123+
# PERF_RESULT1_TP and PERF_RESULT2_TP are informational only (new thread pool is experimental)
124+
if [[ "$PERF_RESULT1" == "OK" && "$PERF_RESULT2" == "OK" && "$PERF_RESULT3" == "OK" && "$PERF_RESULT1_NDD" == "OK" ]]; then
125+
CLEAN_AND_EXIT 0
126+
fi
76127

77-
local value=$(extract_perf "$1")
78-
local min_value=$2
79-
local percent=${3:-0}
80-
# Check if value is within percent% of min_value below
81-
local tolerance=$(awk -v p="$percent" 'BEGIN{print p/100}')
82-
echo "$value $min_value" | awk -v tol="$tolerance" '{
83-
lower = $2 * (1 - tol);
84-
if ($1 >= lower) {print "OK"} else {print "FAIL"}
85-
}'
86-
}
87-
PERF_RESULT1=$(perf_check "${LOG1}" "$MIN_PERF")
88-
PERF_RESULT2=$(perf_check "${LOG2}" "$MIN_PERF2")
89-
PERF_RESULT1_NDD=$(perf_check "${LOG1_NDD}" "$MIN_PERF_NDD")
90-
PERF_RESULT2_NDD=$(perf_check "${LOG2_NDD}" "$MIN_PERF2_NDD")
91-
PERF_RESULT3=$(perf_check "${LOG2}" "$(extract_perf "${LOG1}")" 5)
92-
PERF_RESULT3_NDD=$(perf_check "${LOG2_NDD}" "$(extract_perf "${LOG1_NDD}")" 5)
93-
PERF_RESULT1_TP=$(perf_check "${LOG1_TP}" "$(extract_perf "${LOG1}")" 2)
94-
PERF_RESULT2_TP=$(perf_check "${LOG2_TP}" "$(extract_perf "${LOG2}")" 2)
128+
# On failure: re-run all benchmarks with nsys and save profiles to core_artifacts
129+
echo "Performance check failed; re-running all benchmarks with nsys profiling..."
130+
ARTIFACTS_DIR="${topdir}/core_artifacts"
131+
mkdir -p "${ARTIFACTS_DIR}"
95132

96-
echo "PERF_RESULT1=${PERF_RESULT1}"
97-
echo "PERF_RESULT2=${PERF_RESULT2}"
98-
echo "PERF_RESULT3=${PERF_RESULT3}"
99-
echo "PERF_RESULT1_TP=${PERF_RESULT1_TP}"
100-
echo "PERF_RESULT2_TP=${PERF_RESULT2_TP}"
101-
echo "PERF_RESULT1_NDD=${PERF_RESULT1_NDD}"
102-
echo "PERF_RESULT2_NDD=${PERF_RESULT2_NDD}"
103-
echo "PERF_RESULT3_NDD=${PERF_RESULT3_NDD}"
133+
NSYS_REP="enabled" run_all_benchmarks
104134

105-
# if [[ "$PERF_RESULT1" == "OK" && "$PERF_RESULT2" == "OK" && "$PERF_RESULT3" == "OK" && "$PERF_RESULT1_NDD" == "OK" && "$PERF_RESULT2_NDD" == "OK" && "$PERF_RESULT3_NDD" == "OK" ]]; then
106-
# don't check experimental decoder performance with dynamic mode
107-
if [[ "$PERF_RESULT1" == "OK" && "$PERF_RESULT2" == "OK" && "$PERF_RESULT1_TP" == "OK" && "$PERF_RESULT2_TP" == "OK" && "$PERF_RESULT3" == "OK" && "$PERF_RESULT1_NDD" == "OK" ]]; then
108-
CLEAN_AND_EXIT 0
109-
else
135+
cp -f *.nsys-rep "${ARTIFACTS_DIR}/" 2>/dev/null || true
136+
echo "nsys profiles saved to ${ARTIFACTS_DIR}"
110137
CLEAN_AND_EXIT 1
111-
fi
112138
}
113139
pushd ../..
114140
source ./qa/test_template.sh
115-
popd
141+
popd

0 commit comments

Comments
 (0)