
Commit a5c78c5

Sebastien Loisel and claude committed

Fix docs: CUDA requires NCCL+CUDSS_jll; MPI.Init() after using statements

- CUDA extension requires: using CUDA, NCCL, CUDSS_jll
- MPI.Init() should be called after all using statements, not before
- Fixed all occurrences across guide.md, examples.md, index.md, installation.md

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 1533ddc commit a5c78c5

4 files changed: 27 additions & 47 deletions
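Taken together, the new-side lines of this diff prescribe a single pattern. A minimal, self-contained sketch of it follows; the `VectorMPI` constructor and the `io0()` helper are lifted from the examples in this diff, and it is assumed here that `io0()` returns an IO stream that prints only on rank 0:

```julia
# Corrected ordering per this commit: all `using` statements first,
# MPI.Init() last, before any LinearAlgebraMPI calls.
using MPI
using LinearAlgebraMPI
using SparseArrays
using LinearAlgebra
MPI.Init()

v = VectorMPI(randn(100))            # data is distributed across ranks automatically
println(io0(), "initialization OK")  # io0(): assumed to print once, from rank 0
```

Run under MPI as shown in installation.md below, e.g. `mpiexec -n 2 julia --project script.jl`.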


docs/src/examples.md

Lines changed: 17 additions & 34 deletions
@@ -8,9 +8,8 @@ This page provides detailed examples of using LinearAlgebraMPI.jl for various di
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 using LinearAlgebra
 
@@ -45,9 +44,8 @@ println(io0(), "Multiplication error: $err")
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 using LinearAlgebra
 
@@ -79,9 +77,8 @@ println(io0(), "Result size: $(size(Cdist))")
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 using LinearAlgebra
 
@@ -121,9 +118,8 @@ println(io0(), "Complex matrix operations completed")
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 using LinearAlgebra
 
@@ -167,9 +163,8 @@ The `transpose` function creates a lazy wrapper without transposing the data. Th
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 using LinearAlgebra
 
@@ -199,9 +194,8 @@ println(io0(), "Lazy transpose multiplication completed")
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 using LinearAlgebra
 
@@ -238,9 +232,8 @@ println(io0(), "transpose(A) * B error: $err")
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 using LinearAlgebra
 
@@ -274,9 +267,8 @@ println(io0(), "Scalar multiplication errors: $err1, $err2")
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 using LinearAlgebra
 
@@ -313,9 +305,8 @@ Here's an example of using LinearAlgebraMPI.jl for power iteration to find the d
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 using LinearAlgebra
 
@@ -351,9 +342,8 @@ LinearAlgebraMPI provides distributed sparse direct solvers using the multifront
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 using LinearAlgebra
 
@@ -391,9 +381,8 @@ println(io0(), "LDLT solve residual: $residual")
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 using LinearAlgebra
 
@@ -426,9 +415,8 @@ LDLT uses Bunch-Kaufman pivoting to handle symmetric indefinite matrices:
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 using LinearAlgebra
 
@@ -459,9 +447,8 @@ For sequences of matrices with the same sparsity pattern, the symbolic factoriza
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 using LinearAlgebra
 
@@ -501,9 +488,8 @@ println(io0(), "F2 residual: ", norm(A2 * x2_full - ones(n), Inf))
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 
 n = 100
@@ -543,9 +529,8 @@ Row-wise operations are local - no MPI communication is needed since rows are al
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using LinearAlgebra
 
 # Create a deterministic dense matrix (same on all ranks)
@@ -569,9 +554,8 @@ Column-wise operations require MPI communication to gather each full column:
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using LinearAlgebra
 
 # Create a deterministic dense matrix
@@ -594,9 +578,8 @@ The standard Julia pattern `vcat(f.(eachrow(A))...)` doesn't work with distribut
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using LinearAlgebra
 
 # Standard Julia pattern (for comparison):

docs/src/guide.md

Lines changed: 7 additions & 7 deletions
@@ -32,10 +32,9 @@ In Julia, `SparseMatrixCSR{T,Ti}` is a type alias for `Transpose{T, SparseMatrix
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
 using SparseArrays
+MPI.Init()
 
 # Create from native types (data is distributed automatically)
 v = VectorMPI(randn(100))
@@ -371,14 +370,14 @@ Load the GPU package **before** MPI for proper detection:
 # For Metal (macOS)
 using Metal
 using MPI
-MPI.Init()
 using LinearAlgebraMPI
+MPI.Init()
 
 # For CUDA (Linux/Windows)
-using CUDA
+using CUDA, NCCL, CUDSS_jll
 using MPI
-MPI.Init()
 using LinearAlgebraMPI
+MPI.Init()
 ```
 
 ### Converting Between CPU and GPU
@@ -460,9 +459,10 @@ For multi-GPU distributed sparse direct solves, LinearAlgebraMPI provides `CuDSS
 ### Basic Usage
 
 ```julia
-using CUDA, MPI
-MPI.Init()
+using CUDA, NCCL, CUDSS_jll
+using MPI
 using LinearAlgebraMPI
+MPI.Init()
 
 # Each MPI rank should use a different GPU
 CUDA.device!(MPI.Comm_rank(MPI.COMM_WORLD) % length(CUDA.devices()))
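Combining the two CUDA hunks in this file, the full multi-GPU startup sequence reads roughly as the sketch below, assembled from the new-side lines above (the device-selection line is verbatim from the last hunk):

```julia
# CUDA path per this commit: the extension activates only when CUDA, NCCL,
# and CUDSS_jll are all loaded; MPI.Init() still comes after every `using`.
using CUDA, NCCL, CUDSS_jll
using MPI
using LinearAlgebraMPI
MPI.Init()

# Each MPI rank should use a different GPU (verbatim from the hunk above)
CUDA.device!(MPI.Comm_rank(MPI.COMM_WORLD) % length(CUDA.devices()))
```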

docs/src/index.md

Lines changed: 1 addition & 2 deletions
@@ -28,9 +28,8 @@ LinearAlgebraMPI.jl provides distributed matrix and vector types for parallel co
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 
 # Create distributed sparse matrix

docs/src/installation.md

Lines changed: 2 additions & 4 deletions
@@ -46,9 +46,8 @@ mpiexec -n 2 julia --project test/runtests.jl
 ```julia
 # CORRECT
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 # Now you can use the package
 ```
 
@@ -58,9 +57,8 @@ Create a script file (e.g., `my_program.jl`):
 
 ```julia
 using MPI
-MPI.Init()
-
 using LinearAlgebraMPI
+MPI.Init()
 using SparseArrays
 
 # Create distributed matrix
