Skip to content

Commit bcde351

Browse files
committed
CUDA: Check PTX version on host side to guard PDL dispatch
Checking on `__CUDA_ARCH_LIST__` alone is insufficient for JIT, as this variable doesn't differentiate between compiling for say sm_90, sm_90a or sm_90f (so forward-jittable PTX vs. arch/family-specific PTX). Thus, one can have a bug when compiling with `DCMAKE_CUDA_ARCHITECTURES="89;90a"`, where current code would wrongly dispatch to PDL on sm_90/sm_120 in forward-JIT mode. This PR fixes this issue by checking `cudaFuncAttributes::ptxVersion` of the incoming kernel at runtime. A check on ptxVersion alone is sufficient, as device-codes will always be >= ptxVersion (and any violation of this would be a severe bug in CUDA/nvcc), see: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-code-code-code
1 parent ef570f6 commit bcde351

1 file changed

Lines changed: 40 additions & 2 deletions

File tree

ggml/src/ggml-cuda/common.cuh

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <cstdint>
88
#include <cstdlib>
99
#include <memory>
10+
#include <mutex>
1011

1112
#if defined(GGML_USE_HIP)
1213
#define GGML_COMMON_DECL_HIP
@@ -1549,6 +1550,44 @@ struct ggml_cuda_pdl_config {
15491550
ggml_cuda_pdl_config& operator=(ggml_cuda_pdl_config&&) = delete;
15501551

15511552
};
1553+
1554+
static bool ggml_cuda_kernel_can_use_pdl(const void * kernel) {
1555+
const int device = ggml_cuda_get_device();
1556+
1557+
struct cache_key {
1558+
int device;
1559+
const void * kernel;
1560+
1561+
bool operator==(const cache_key & other) const { return device == other.device && kernel == other.kernel; }
1562+
};
1563+
1564+
struct cache_key_hash {
1565+
std::size_t operator()(const cache_key & key) const {
1566+
return std::hash<int>{}(key.device) ^ (std::hash<const void *>{}(key.kernel) << 1);
1567+
}
1568+
};
1569+
1570+
static std::mutex cache_mutex;
1571+
static std::unordered_map<cache_key, bool, cache_key_hash> cache;
1572+
1573+
const cache_key key = { device, kernel };
1574+
std::lock_guard<std::mutex> lock(cache_mutex);
1575+
const auto it = cache.find(key);
1576+
if (it != cache.end()) {
1577+
return it->second;
1578+
}
1579+
1580+
cudaFuncAttributes attr = {};
1581+
CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel));
1582+
1583+
// PDL device-side primitives are emitted only for PTX versions >= 90.
1584+
// We have to guard on a loaded kernel's PTX version so a kernel forward-JIT'ed
1585+
// from pre-Hopper PTX to a Hopper-or-newer GPU does not opt into PDL.
1586+
const bool can_use_pdl = attr.ptxVersion >= 90;
1587+
cache.emplace(key, can_use_pdl);
1588+
return can_use_pdl;
1589+
}
1590+
15521591
#endif //defined(GGML_CUDA_USE_PDL)
15531592

15541593

@@ -1561,8 +1600,7 @@ static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_ke
15611600
return env == nullptr || std::atoi(env) != 0;
15621601
}();
15631602

1564-
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
1565-
if (env_pdl_enabled && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_HOPPER) {
1603+
if (env_pdl_enabled && ggml_cuda_kernel_can_use_pdl(reinterpret_cast<const void *>(kernel))) {
15661604
auto pdl_cfg = ggml_cuda_pdl_config(launch_params);
15671605

15681606
CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward<Args>(args)... ));

0 commit comments

Comments
 (0)