1-
21#! /bin/bash
32# Copyright (c) Meta Platforms, Inc. and affiliates.
43# All rights reserved.
@@ -22,7 +21,7 @@ install_swiftshader() {
2221 tar -C " ${_swiftshader_dir} " -xzf " ${_tmp_archive} "
2322
2423 export VK_ICD_FILENAMES=" ${_swiftshader_dir} /swiftshader/build/Linux/vk_swiftshader_icd.json"
25- export LD_LIBRARY_PATH=" ${_swiftshader_dir} /swiftshader/build/Linux/"
24+ export LD_LIBRARY_PATH=" ${_swiftshader_dir} /swiftshader/build/Linux/: ${LD_LIBRARY_PATH :- } "
2625 export ETVK_USING_SWIFTSHADER=1
2726}
2827
@@ -43,7 +42,125 @@ install_vulkan_sdk() {
4342 export PATH=" ${PATH} :${_vulkan_sdk_dir} /${VULKAN_SDK_VERSION} /x86_64/bin/"
4443}
4544
# Run the given command with root privileges.
# Arguments: "$@" - the command and its arguments, passed through verbatim.
# Returns:   the command's exit status; 127 when the caller is not root and
#            sudo is not installed (same status the shell would report for a
#            missing command, but with an explicit diagnostic on stderr).
_maybe_sudo() {
  # Keep `[` rather than `[[`: an empty `id -u` result must be an error, not
  # silently evaluated as 0 (root) by arithmetic context.
  if [ "$(id -u)" -eq 0 ]; then
    "$@"
  elif command -v sudo >/dev/null 2>&1; then
    sudo "$@"
  else
    echo "ERROR: root privileges required but sudo is not installed: $*" >&2
    return 127
  fi
}
52+
# Install glslc (via conda-forge's shaderc) into an isolated conda prefix and
# prepend that prefix's bin directory to PATH.
# Globals: PATH (modified and exported on success only)
# Returns: 0 on success, 1 if the conda environment could not be created.
install_glslc() {
  # The glslc shipped in the LunarG SDK is dynamically linked against a newer
  # glibc/libstdc++ than the manylinux_2_28 / AlmaLinux 8 CUDA runner image
  # provides (glibc 2.28), where it fails to load with "GLIBC_2.29 not found".
  # conda-forge's shaderc is built against an old sysroot, runs there, and is
  # recent enough for the GL_EXT_integer_dot_product / GL_KHR_cooperative_matrix
  # extensions the Vulkan shaders use. Install it into an isolated prefix so the
  # base conda env that builds ExecuTorch is left untouched, then put it on PATH.
  local glslc_prefix=/tmp/shaderc

  # Only touch PATH once the install has succeeded, so a failed install does
  # not leave a dangling directory at the front of PATH.
  if ! conda create -y -p "${glslc_prefix}" -c conda-forge shaderc; then
    echo "ERROR: failed to install shaderc (glslc) into ${glslc_prefix}" >&2
    return 1
  fi
  export PATH="${glslc_prefix}/bin:${PATH}"
}
65+
# Install the Khronos Vulkan loader and vulkan-tools from the system package
# manager when dnf is available.
# Returns: the status of the dnf install, or 0 (with a warning on stderr) when
#          dnf is not present and nothing was installed.
install_vulkan_loader() {
  # libvulkan.so.1 (the Khronos loader that volk dlopen()s at runtime) is not part
  # of the NVIDIA driver and is absent from the CUDA builder image; vulkan-tools
  # provides vulkaninfo for the device sanity check. Both ship as native el8 RPMs.
  if ! command -v dnf >/dev/null 2>&1; then
    # Still best-effort (status 0), but say so: a silently skipped install makes
    # a later "libvulkan.so.1 not found" hard to trace back to this step.
    echo "WARNING: dnf not found; skipping vulkan-loader/vulkan-tools install." >&2
    return 0
  fi
  _maybe_sudo dnf install -y vulkan-loader vulkan-tools
}
74+
# Locate the NVIDIA Vulkan driver library (libGLX_nvidia.so.0).
# Outputs: the library path on stdout (no trailing newline), or nothing when
#          the library cannot be found.
# Returns: 0 in both cases; callers test the output for emptiness.
_find_nvidia_vulkan_library() {
  # NVIDIA implements its Vulkan ICD inside libGLX_nvidia.so.0. The NVIDIA
  # container runtime mounts this library into the container (it is pulled from
  # the driver's ldcache when NVIDIA_DRIVER_CAPABILITIES includes graphics/all),
  # so prefer ldconfig and fall back to the usual mount locations.
  local lib cand

  # awk deliberately reads ldconfig's output to the end instead of exiting on
  # the first hit: an early exit can SIGPIPE ldconfig, which under
  # `set -o pipefail` turns a successful lookup into a failure. `|| true` keeps
  # a missing/failing ldconfig from aborting a `set -e` caller before the
  # path fallback below gets a chance to run.
  lib="$(ldconfig -p 2>/dev/null |
    awk '/libGLX_nvidia\.so\.0/ && !seen { path = $NF; seen = 1 }
         END { if (seen) print path }')" || true

  if [[ -z "${lib}" ]]; then
    for cand in /usr/lib64/libGLX_nvidia.so.0 \
      /usr/lib/x86_64-linux-gnu/libGLX_nvidia.so.0 \
      /usr/lib/libGLX_nvidia.so.0; do
      if [[ -e "${cand}" ]]; then
        lib="${cand}"
        break
      fi
    done
  fi
  printf '%s' "${lib}"
}
94+
# Report whether the Vulkan loader enumerates a hardware GPU.
# Returns: 0 if vulkaninfo lists a discrete, integrated or virtual GPU, or if
#          vulkaninfo is not installed (the check cannot be performed, so the
#          caller's choice is not vetoed); non-zero otherwise.
_vulkan_has_real_device() {
  # True if the loader enumerates a hardware GPU. vulkaninfo can exit non-zero
  # for unrelated reasons (no display/WSI), so key off the reported deviceType.
  command -v vulkaninfo >/dev/null 2>&1 || return 0

  # Capture the output first and ignore vulkaninfo's own status. Piping straight
  # into `grep -q` would, under `set -o pipefail`, report failure whenever
  # vulkaninfo exits non-zero or is SIGPIPE'd by grep's early exit, even though
  # a GPU line was matched.
  local summary
  summary="$(vulkaninfo --summary 2>/dev/null)" || true
  grep -qE 'PHYSICAL_DEVICE_TYPE_(DISCRETE|INTEGRATED|VIRTUAL)_GPU' <<<"${summary}"
}
102+
# Point the Vulkan loader at the real NVIDIA GPU when one is usable, otherwise
# fall back to SwiftShader.
# Globals: VK_ICD_FILENAMES (exported on success; unset before fallback),
#          ETVK_USING_SWIFTSHADER (unset while the NVIDIA ICD is pinned)
# Outputs: a one-line confirmation on stdout when the GPU is selected;
#          warnings and loader diagnostics on stderr when falling back.
# Returns: 0 when the NVIDIA ICD is selected, else install_swiftshader's status.
setup_real_gpu_icd() {
  # Select a Vulkan ICD so the runtime exercises the real GPU when one is usable.
  # Two quirks of the CUDA CI image make this non-trivial:
  #   1. The NVIDIA container runtime mounts the driver's Vulkan library but does
  #      not register its ICD manifest, so the loader never discovers the GPU on
  #      its own. We synthesize the manifest and pin the loader to it.
  #   2. Installing vulkan-loader/vulkan-tools pulls in mesa-vulkan-drivers,
  #      which drop Intel/AMD/lavapipe manifests for absent hardware. lavapipe
  #      fails vkCreateInstance on this image and, because the loader walks every
  #      manifest in icd.d, that poisons device enumeration for the whole
  #      process. Pinning VK_ICD_FILENAMES makes the loader ignore icd.d, so the
  #      broken stubs cannot interfere.
  local nvidia_lib
  local icd=/tmp/nvidia_icd.json
  nvidia_lib="$(_find_nvidia_vulkan_library)"

  if [[ -n "${nvidia_lib}" ]]; then
    # Only pin the loader to the manifest if it was actually written; otherwise
    # VK_ICD_FILENAMES would point at a missing or truncated file.
    if printf '%s\n' \
      '{' \
      '    "file_format_version": "1.0.0",' \
      '    "ICD": {' \
      "        \"library_path\": \"${nvidia_lib}\"," \
      '        "api_version": "1.3.0"' \
      '    }' \
      '}' >"${icd}"; then
      export VK_ICD_FILENAMES="${icd}"
      unset ETVK_USING_SWIFTSHADER || true
      if _vulkan_has_real_device; then
        echo "Real NVIDIA GPU selected; pinned Vulkan ICD to ${nvidia_lib}"
        return 0
      fi
      echo "WARNING: ${nvidia_lib} present but no GPU enumerated; using SwiftShader." >&2
      # Surface why the NVIDIA driver did not enumerate (e.g. a missing dependency
      # of libGLX_nvidia, or no render node) so the fallback is diagnosable in CI.
      # This runs while VK_ICD_FILENAMES is still pinned to the NVIDIA manifest.
      if command -v vulkaninfo >/dev/null 2>&1; then
        {
          echo "--- NVIDIA Vulkan ICD diagnostic ---"
          VK_LOADER_DEBUG=warn vulkaninfo --summary 2>&1 | head -40 || true
          echo "--- end diagnostic ---"
        } >&2
      fi
      unset VK_ICD_FILENAMES
    else
      echo "WARNING: could not write ${icd}; using SwiftShader." >&2
    fi
    # Do not leave an unused (or partial) synthesized manifest behind.
    rm -f -- "${icd}"
  else
    echo "WARNING: no NVIDIA Vulkan driver library found; using SwiftShader." >&2
  fi
  install_swiftshader
}
148+
VULKAN_SDK_VERSION="1.4.321.1"

# Entry point. The first positional argument selects the setup mode:
#   real-gpu         - prefer a real system ICD when one is present.
#   anything else /  - install SwiftShader plus the LunarG SDK, so the existing
#   no argument        CPU-runner CI is unchanged.
if [[ "${1:-swiftshader}" == "real-gpu" ]]; then
  # Do not download the LunarG SDK here: its prebuilt glslc cannot run on the
  # old-glibc CUDA image. glslc comes from conda-forge and the loader from the
  # system package manager instead.
  install_vulkan_loader
  install_glslc
  setup_real_gpu_icd
else
  install_swiftshader
  install_vulkan_sdk "${VULKAN_SDK_VERSION}"
fi
0 commit comments