Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 68 additions & 35 deletions pkgs/development/python-modules/torch/default.nix
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{ stdenv, lib, fetchFromGitHub, buildPythonPackage, python,
{ stdenv, lib, fetchFromGitHub, fetchpatch, buildPythonPackage, python,
config, cudaSupport ? config.cudaSupport, cudaPackages, magma,
useSystemNccl ? true,
MPISupport ? false, mpi,
Expand Down Expand Up @@ -52,17 +52,8 @@

let
inherit (lib) lists strings trivial;
inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl;
in
inherit (cudaPackages) cudaFlags cudnn nccl;

assert cudaSupport -> stdenv.isLinux;
assert cudaSupport -> (cudaPackages.cudaMajorVersion == "11");

# confirm that cudatoolkits are sync'd across dependencies
assert !(MPISupport && cudaSupport) || mpi.cudatoolkit == cudatoolkit;
assert !cudaSupport || magma.cudaPackages.cudatoolkit == cudatoolkit;

let
setBool = v: if v then "1" else "0";

# https://github.com/pytorch/pytorch/blob/v2.0.1/torch/utils/cpp_extension.py#L1744
Expand Down Expand Up @@ -103,23 +94,6 @@ let
throw "No GPU targets specified"
);

cudatoolkit_joined = symlinkJoin {
name = "${cudatoolkit.name}-unsplit";
# nccl is here purely for semantic grouping it could be moved to nativeBuildInputs
paths = [ cudatoolkit.out cudatoolkit.lib nccl.dev nccl.out ];
};

# Normally libcuda.so.1 is provided at runtime by nvidia-x11 via
# LD_LIBRARY_PATH=/run/opengl-driver/lib. We only use the stub
# libcuda.so from cudatoolkit for running tests, so that we don’t have
# to recompile pytorch on every update to nvidia-x11 or the kernel.
cudaStub = linkFarm "cuda-stub" [{
name = "libcuda.so.1";
path = "${cudatoolkit}/lib/stubs/libcuda.so";
}];
cudaStubEnv = lib.optionalString cudaSupport
"LD_LIBRARY_PATH=${cudaStub}\${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH ";

rocmtoolkit_joined = symlinkJoin {
name = "rocm-merged";

Expand Down Expand Up @@ -160,6 +134,12 @@ in buildPythonPackage rec {
# base is 10.12. Until we upgrade, we can fall back on the older
# pthread support.
./pthreadpool-disable-gcd.diff
] ++ lib.optionals stdenv.isLinux [
# Propagate CUPTI to Kineto by overriding the search path with environment variables.
(fetchpatch {
url = "https://github.com/pytorch/pytorch/pull/108847/commits/7ae4d7c0e2dec358b4fe81538efe9da5eb580ec9.patch";
hash = "sha256-skFaDg98xcJqJfzxWk+qhUxPLHDStqvd0mec3PgksIg=";
})
Comment thread
ConnorBaker marked this conversation as resolved.
Outdated
];

postPatch = lib.optionalString rocmSupport ''
Expand All @@ -184,6 +164,13 @@ in buildPythonPackage rec {
--replace "set(ROCM_PATH \$ENV{ROCM_PATH})" \
"set(ROCM_PATH \$ENV{ROCM_PATH})''\nset(ROCM_VERSION ${lib.concatStrings (lib.intersperse "0" (lib.splitString "." hip.version))})"
''
# Detection of NCCL version doesn't work particularly well when using the static binary.
Comment thread
ConnorBaker marked this conversation as resolved.
Outdated
+ lib.optionalString cudaSupport ''
substituteInPlace cmake/Modules/FindNCCL.cmake \
--replace \
'message(FATAL_ERROR "Found NCCL header version and library version' \
'message(WARNING "Found NCCL header version and library version'
''
# error: no member named 'aligned_alloc' in the global namespace; did you mean simply 'aligned_alloc'
# This lib overrided aligned_alloc hence the error message. Tltr: his function is linkable but not in header.
+ lib.optionalString (stdenv.isDarwin && lib.versionOlder stdenv.targetPlatform.darwinSdkVersion "11.0") ''
Expand All @@ -192,12 +179,16 @@ in buildPythonPackage rec {
inline void *aligned_alloc(size_t align, size_t size)'
'';

# NOTE(@connorbaker): Though we do not disable Gloo or MPI when building with CUDA support, caution should be taken
# when using the different backends. Gloo's GPU support isn't great, and MPI and CUDA can't be used at the same time
# without extreme care to ensure they don't lock each other out of shared resources.
# For more, see https://github.com/open-mpi/ompi/issues/7733#issuecomment-629806195.
preConfigure = lib.optionalString cudaSupport ''
export TORCH_CUDA_ARCH_LIST="${gpuTargetString}"
export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we refer to backendStdenv.cc somehow else now, or did we just drop it? In the latter case, I suspect things are going to break again next time nixpkgs bumps glibc

'' + lib.optionalString (cudaSupport && cudnn != null) ''
export CUDNN_INCLUDE_DIR=${cudnn.dev}/include
export CUDNN_LIB_DIR=${cudnn.lib}/lib
export CUPTI_INCLUDE_DIR=${cudaPackages.cuda_cupti.dev}/include
export CUPTI_LIBRARY_DIR=${cudaPackages.cuda_cupti.lib}/lib
'' + lib.optionalString rocmSupport ''
export ROCM_PATH=${rocmtoolkit_joined}
export ROCM_SOURCE_DIR=${rocmtoolkit_joined}
Expand Down Expand Up @@ -256,6 +247,7 @@ in buildPythonPackage rec {
PYTORCH_BUILD_NUMBER = 0;

USE_SYSTEM_NCCL = setBool useSystemNccl; # don't build pytorch's third_party NCCL
USE_STATIC_NCCL = setBool useSystemNccl;

# Suppress a weird warning in mkl-dnn, part of ideep in pytorch
# (upstream seems to have fixed this in the wrong place?)
Expand Down Expand Up @@ -286,12 +278,43 @@ in buildPythonPackage rec {
pybind11
pythonRelaxDepsHook
removeReferencesTo
] ++ lib.optionals cudaSupport [ cudatoolkit_joined ]
++ lib.optionals rocmSupport [ rocmtoolkit_joined ];
] ++ lib.optionals cudaSupport (with cudaPackages; [
autoAddOpenGLRunpathHook
cuda_nvcc
])
++ lib.optionals rocmSupport [ rocmtoolkit_joined ];

buildInputs = [ blas blas.provider pybind11 ]
++ lib.optionals stdenv.isLinux [ linuxHeaders_5_19 ] # TMP: avoid "flexible array member" errors for now
++ lib.optionals cudaSupport [ cudnn.dev cudnn.lib nccl ]
++ lib.optionals cudaSupport (with cudaPackages; [
cuda_cccl.dev # <thrust/*>
cuda_cudart # cuda_runtime.h and libraries
cuda_cupti.dev # For kineto
cuda_cupti.lib # For kineto
cuda_nvcc.dev # crt/host_config.h; even though we include this in nativeBuildinputs, it's needed here too
cuda_nvml_dev.dev # <nvml.h>
cuda_nvrtc.dev
cuda_nvrtc.lib
cuda_nvtx.dev
cuda_nvtx.lib # -llibNVToolsExt
cudnn.dev
cudnn.lib
libcublas.dev
libcublas.lib
libcufft.dev
libcufft.lib
libcurand.dev
libcurand.lib
libcusolver.dev
libcusolver.lib
libcusparse.dev
libcusparse.lib
nccl.dev # Provides nccl.h AND a static copy of NCCL!
] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [
cuda_nvprof.dev # <cuda_profiler_api.h>
] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [
cuda_profiler_api.dev # <cuda_profiler_api.h>
])
++ lib.optionals rocmSupport [ openmp ]
++ lib.optionals (cudaSupport || rocmSupport) [ magma ]
++ lib.optionals stdenv.isLinux [ numactl ]
Expand Down Expand Up @@ -335,7 +358,6 @@ in buildPythonPackage rec {

checkPhase = with lib.versions; with lib.strings; concatStringsSep " " [
"runHook preCheck"
cudaStubEnv
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm. Why does it need the LD_LIBRARY_PATH anymore?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Setting doCheck = true;, it still works 🤷‍♂️

Should it not?

Copy link
Copy Markdown
Contributor

@SomeoneSerge SomeoneSerge Sep 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this makes sense, I'm not sure why the stub was ever needed (maybe some of the binaries had libcuda.so in DT_NEEDED, and now they only do dlopen?)

Good riddance then

"${python.interpreter} test/run_test.py"
"--exclude"
(concatStringsSep " " [
Expand Down Expand Up @@ -419,6 +441,17 @@ in buildPythonPackage rec {
license = licenses.bsd3;
maintainers = with maintainers; [ teh thoughtpolice tscholak ]; # tscholak esp. for darwin-related builds
platforms = with platforms; linux ++ lib.optionals (!cudaSupport && !rocmSupport) darwin;
broken = rocmSupport && cudaSupport; # CUDA and ROCm are mutually exclusive
broken = builtins.any trivial.id [
# CUDA and ROCm are mutually exclusive
(cudaSupport && rocmSupport)
# CUDA is only supported on Linux
(cudaSupport && !stdenv.isLinux)
# Only CUDA 11 is currently supported
(cudaSupport && (cudaPackages.cudaMajorVersion != "11"))
# MPI cudatoolkit does not match cudaPackages.cudatoolkit
(MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit))
# Magma cudaPackages does not match cudaPackages
(cudaSupport && (magma.cudaPackages != cudaPackages))
];
};
}