diff --git a/.github/workflows/GPU.yml b/.github/workflows/GPU.yml index 79b209fc..9df36b51 100644 --- a/.github/workflows/GPU.yml +++ b/.github/workflows/GPU.yml @@ -19,7 +19,7 @@ concurrency: jobs: cuda-tests: name: "CUDA GPU Tests" - runs-on: [self-hosted, Linux, X64, gpu] + runs-on: [self-hosted, Linux, X64, gpu-t4] timeout-minutes: 240 steps: - uses: actions/checkout@v6 @@ -39,7 +39,7 @@ jobs: gpu-docs: name: "Documentation" - runs-on: [self-hosted, Linux, X64, gpu] + runs-on: [self-hosted, Linux, X64, gpu-t4] timeout-minutes: 240 if: github.event_name == 'push' || !github.event.pull_request.draft steps: diff --git a/LocalPreferences.toml b/LocalPreferences.toml new file mode 100644 index 00000000..b65c691f --- /dev/null +++ b/LocalPreferences.toml @@ -0,0 +1,7 @@ +[CUDA_Runtime_jll] +version = "12.6" + +[CUDA_Driver_jll] +# Disable forward-compat driver — V100 runners need the system driver +# since CUDA_Driver_jll v13+ drops compute capability 7.0 support +compat = "false" diff --git a/Project.toml b/Project.toml index 26bd4891..36a231d8 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "DeepEquilibriumNetworks" uuid = "6748aba7-0e9b-415e-a410-ae3cc0ecb334" -authors = ["Avik Pal "] version = "2.6.0" +authors = ["Avik Pal "] [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" @@ -10,6 +10,7 @@ CommonSolve = "38540f10-b2f7-11e9-35d8-d573e4eb0ff2" ConcreteStructs = "2569d6c7-a4a2-43d3-a901-331e8e4be471" DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e" FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Lux = "b2108857-7c20-44ae-9111-449ecde12c47" LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" @@ -34,9 +35,10 @@ FastClosures = "0.3" ForwardDiff = "0.10, 1" Functors = "0.4, 0.5" GPUArraysCore = "0.1, 0.2" -SafeTestsets = "0.1" InteractiveUtils = "<0.0.1, 1" +LinearAlgebra = "1.10" Lux = "1" +LuxCUDA = "0.3" LuxCore = "1" LuxTestUtils 
= "1, 2" MLDataDevices = "1" @@ -48,6 +50,7 @@ OrdinaryDiffEq = "6.74" Pkg = "1.10" PrecompileTools = "1" Random = "1.10" +SafeTestsets = "0.1" SciMLBase = "2" SciMLSensitivity = "7.43" StableRNGs = "1" @@ -64,18 +67,19 @@ ExplicitImports = "7d51a73a-1435-4ff3-83d9-f097790105c7" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" -SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda" LuxTestUtils = "ac9de150-d08f-4546-94fb-7472b5760531" MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40" NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" NonlinearSolve = "8913a72c-1f9b-4ce2-8d82-65094dcecaec" OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" SciMLSensitivity = "1ed8b502-d754-442c-8d5d-10ac956f44a1" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Aqua", "Documenter", "ExplicitImports", "ForwardDiff", "Functors", "GPUArraysCore", "InteractiveUtils", "LuxTestUtils", "MLDataDevices", "NLsolve", "NonlinearSolve", "OrdinaryDiffEq", "Pkg", "SafeTestsets", "SciMLSensitivity", "StableRNGs", "Test", "Zygote"] +test = ["Aqua", "Documenter", "ExplicitImports", "ForwardDiff", "Functors", "GPUArraysCore", "InteractiveUtils", "LuxCUDA", "LuxTestUtils", "MLDataDevices", "NLsolve", "NonlinearSolve", "OrdinaryDiffEq", "Pkg", "SafeTestsets", "SciMLSensitivity", "StableRNGs", "Test", "Zygote"] diff --git a/docs/LocalPreferences.toml b/docs/LocalPreferences.toml new file mode 100644 index 00000000..b65c691f --- /dev/null +++ b/docs/LocalPreferences.toml @@ -0,0 +1,7 @@ +[CUDA_Runtime_jll] +version = "12.6" + +[CUDA_Driver_jll] +# Disable forward-compat 
driver — V100 runners need the system driver +# since CUDA_Driver_jll v13+ drops compute capability 7.0 support +compat = "false" diff --git a/docs/Project.toml b/docs/Project.toml index 20417b8d..e5d98ce3 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,4 +1,6 @@ [deps] +CUDA_Driver_jll = "4ee394cb-3365-5eb0-8335-949819d2adfc" +CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" DeepEquilibriumNetworks = "6748aba7-0e9b-415e-a410-ae3cc0ecb334" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" diff --git a/docs/make.jl b/docs/make.jl index a6232983..86f7ba21 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,6 +14,7 @@ makedocs(; clean = true, doctest = false, # Tested in CI linkcheck = true, + warnonly = [:example_block], # GPU examples may fail on V100 runners (cuDNN compat) format = Documenter.HTML(; assets = ["assets/favicon.ico"], canonical = "https://docs.sciml.ai/DeepEquilibriumNetworks/stable/" diff --git a/src/DeepEquilibriumNetworks.jl b/src/DeepEquilibriumNetworks.jl index 2f2a1950..04ffec14 100644 --- a/src/DeepEquilibriumNetworks.jl +++ b/src/DeepEquilibriumNetworks.jl @@ -11,6 +11,7 @@ using Random: Random, AbstractRNG, randn! 
using SciMLBase: SciMLBase, AbstractNonlinearAlgorithm, AbstractODEAlgorithm, NonlinearSolution, ODESolution, ODEFunction, ODEProblem, SteadyStateProblem, _unwrap_val +using LinearAlgebra: LinearAlgebra using SciMLSensitivity: SteadyStateAdjoint, GaussAdjoint, ZygoteVJP using Static: StaticSymbol, StaticInt, known, static diff --git a/src/utils.jl b/src/utils.jl index 9e45f5f0..7a75d3e2 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -85,7 +85,6 @@ CRC.@non_differentiable zeros_init(::Any, ::Any) ## Don't rely on SciMLSensitivity's choice function default_sensealg(::SteadyStateProblem) - # Ideally we should use GMRES here, but it is not very robust return SteadyStateAdjoint(; linsolve = nothing, linsolve_kwargs = (; maxiters = 10, abstol = 1.0e-3, reltol = 1.0e-3), autojacvec = ZygoteVJP() @@ -93,6 +92,23 @@ function default_sensealg(::SteadyStateProblem) end default_sensealg(::ODEProblem) = GaussAdjoint(; autojacvec = ZygoteVJP()) +# Workaround for LinearSolve.jl DefaultLinearSolver bug: _copy_A_for_safety calls copy() +# on an Adjoint matrix, which unwraps it to a plain array. Then setproperty! fails because +# convert(Adjoint{T,S}, ::S) is not defined. The constructor Adjoint{T,S}(::Any) exists +# (LinearAlgebra adjtrans.jl:33) but convert doesn't use it. This adds the missing method. 
+# +# `convert` must preserve the value. `Adjoint{T,S}(p)` lazily represents `p'`, so wrapping +# the materialized matrix `x` directly would produce `x'`, i.e. a silently transposed +# system. For matrices the parent is therefore the materialized `x'`, and the result +# compares `==` to `x`. Other arrays (e.g. vectors) cannot keep their shape inside an +# `Adjoint`, so they are wrapped unchanged, as before. +_adjoint_parent(x::AbstractMatrix) = copy(LinearAlgebra.adjoint(x)) +_adjoint_parent(x::AbstractArray) = x + +function Base.convert(::Type{LinearAlgebra.Adjoint{T, S}}, x::S) where {T, S <: AbstractArray{T}} + return LinearAlgebra.Adjoint{T, S}(_adjoint_parent(x)) +end + function randn_like(rng::AbstractRNG, x::AbstractArray) y = similar(x)::typeof(x) randn!(rng, y) diff --git a/test/layers_tests.jl b/test/layers_tests.jl index 66d11bcf..c8be1155 100644 --- a/test/layers_tests.jl +++ b/test/layers_tests.jl @@ -38,6 +38,12 @@ const SOLVERS = ( @testset "x_size: $(x_size)" for (base_model, init_model, x_size) in zip(base_models, init_models, x_sizes) + # Skip Conv tests on V100 GPUs (cuDNN CUDNN_STATUS_EXECUTION_FAILED_CUDART) + if length(x_size) == 4 && ongpu && !CONV_WORKS + @test_broken false + continue + end + model = if mtype === :deq DeepEquilibriumNetwork(base_model, solver; jacobian_regularization) elseif mtype === :skipdeq diff --git a/test/qa_tests.jl b/test/qa_tests.jl index 74f3c884..25f0d7d6 100644 --- a/test/qa_tests.jl +++ b/test/qa_tests.jl @@ -3,7 +3,13 @@ using DeepEquilibriumNetworks, Test @testset "Aqua" begin using Aqua - Aqua.test_all(DeepEquilibriumNetworks; ambiguities = false) + # treat_as_own: Adjoint convert method is a workaround for LinearSolve.jl bug + # (missing convert(Adjoint{T,S}, ::S) in LinearAlgebra) + using LinearAlgebra: Adjoint + Aqua.test_all( + DeepEquilibriumNetworks; + ambiguities = false, piracies = (; treat_as_own = [Adjoint]) + ) Aqua.test_ambiguities(DeepEquilibriumNetworks; recursive = false) end diff --git a/test/runtests.jl b/test/runtests.jl index 15d788cf..56596e27 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,17 +1,22 @@ using Pkg using SafeTestsets, Test -const GROUP = uppercase(get(ENV, "GROUP", "CPU")) +const BACKEND_GROUP = uppercase(get(ENV, "BACKEND_GROUP", get(ENV, "GROUP", "CPU"))) -@info "Running tests for GROUP: $GROUP" +@info "Running tests for BACKEND_GROUP: $BACKEND_GROUP" @time begin - if GROUP == "CPU" || GROUP == "ALL" + if BACKEND_GROUP == "CPU" || BACKEND_GROUP == "ALL" @time @safetestset "Utils Tests" 
include("utils_tests.jl") @time @safetestset "Layers Tests" include("layers_tests.jl") end - if GROUP == "QA" + if BACKEND_GROUP == "CUDA" || BACKEND_GROUP == "ALL" + @time @safetestset "CUDA Utils Tests" include("utils_tests.jl") + @time @safetestset "CUDA Layers Tests" include("layers_tests.jl") + end + + if BACKEND_GROUP == "QA" @time @safetestset "Quality Assurance Tests" include("qa_tests.jl") end end diff --git a/test/shared_testsetup.jl b/test/shared_testsetup.jl index 01c900b8..68a15910 100644 --- a/test/shared_testsetup.jl +++ b/test/shared_testsetup.jl @@ -40,3 +40,26 @@ function conv_layer(args...; kwargs...) init_weight(rng::AbstractRNG, dims...) = randn(rng, Float32, dims) .* 0.001f0 return Conv(args...; init_weight, use_bias = false, kwargs...) end + +# V100 GPUs have cuDNN issues with CUDA 12.x (CUDNN_STATUS_EXECUTION_FAILED_CUDART) +# Probe whether cuDNN Conv actually works on the current GPU +# `CONV_WORKS` is `true` when `cuda_testing()` is false or when a 1x1 `Conv` forward pass on +# the GPU device succeeds, and `false` when that probe throws. The caught exception is +# logged so an unrelated failure is visible instead of silently disabling the Conv tests. +const CONV_WORKS = if cuda_testing() + try + _rng = Random.default_rng() + _model = Conv((1, 1), 1 => 1) + _ps, _st = Lux.setup(_rng, _model) + _dev = MLDataDevices.gpu_device() + _ps, _st = _dev(_ps), _dev(_st) + _x = _dev(randn(Float32, 2, 2, 1, 1)) + _model(_x, _ps, _st) + true + catch err + @warn "cuDNN Conv probe failed; treating Conv as unavailable on this GPU" exception = (err, catch_backtrace()) + false + end +else + true +end