Refactor precompilation and transpose materialization

Sébastien Loisel · Sébastien Loisel · commit cb0191e75474 · 2025-12-17T17:13:34.000+01:00
- Replace PrecompileTools with SnoopCompile-generated static precompile.jl
- Rename materialize_transpose to SparseMatrixMPI(transpose(A)) constructor
- Refactor SparseMatrixMPI(SparseMatrixCSC) to delegate to SparseMatrixMPI_local
- Add scripts/generate_precompile.jl for regenerating precompile directives
- Update documentation for new transpose materialization API
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -137,6 +137,7 @@ Note: Uses `transpose()` (not adjoint `'`) to correctly handle complex values wi
 `transpose(A)` returns `Transpose{T, SparseMatrixMPI{T}}` (lazy wrapper). Materialization happens automatically when needed:
 - `transpose(A) * transpose(B)` → `transpose(B * A)` (stays lazy)
 - `transpose(A) * B` or `A * transpose(B)` → materializes via `TransposePlan`
+- `SparseMatrixMPI(transpose(A))` → explicitly materializes the transpose (cached bidirectionally)
 
 ### Indexing Operations
 
diff --git a/Project.toml b/Project.toml
@@ -8,19 +8,18 @@ Blake3Hash = "8f478455-a32d-4928-b0e4-72b19a7d5574"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 MUMPS = "55d2b088-9f4e-11e9-26c0-150b02ea6a46"
-PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
 [compat]
 Blake3Hash = "0.3"
 MPI = "0.20"
 MUMPS = "1.5"
-PrecompileTools = "1"
 julia = "1.10"
 
 [extras]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+SnoopCompile = "aa65fe97-06da-5843-b5b1-d5d13cad87d2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
diff --git a/docs/src/api.md b/docs/src/api.md
@@ -52,9 +52,10 @@ C = A ⊛ B   # Parallel multiplication using available threads
 ### Transpose and Adjoint
 
 ```julia
-transpose(A)   # Lazy transpose
-conj(A)        # Conjugate (new matrix)
-A'             # Adjoint (conjugate transpose, lazy)
+transpose(A)              # Lazy transpose
+conj(A)                   # Conjugate (new matrix)
+A'                        # Adjoint (conjugate transpose, lazy)
+SparseMatrixMPI(transpose(A))  # Materialize lazy transpose (cached)
 ```
 
 ### Matrix-Vector Multiplication
diff --git a/scripts/generate_precompile.jl b/scripts/generate_precompile.jl
@@ -0,0 +1,167 @@
+# Generate precompile directives using SnoopCompile
+#
+# Run this script under MPI to generate src/precompile.jl:
+#   mpiexec -n 1 julia --project=. scripts/generate_precompile.jl
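+# Requires SnoopCompile on the load path (Project.toml lists it only under [extras]).
+# Use exactly one MPI rank: every rank would otherwise write the same output file.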
+
+using SnoopCompile
+using SnoopCompile.SnoopCompileCore
+using SparseArrays
+using LinearAlgebra
+using MPI
+
+# Initialize MPI first
+MPI.Init()
+
+# Load the package before the @snoop_inference block so loading is not part of the snooped code
+using LinearAlgebraMPI
+
+# Now snoop on the actual workload
+tinf = @snoop_inference begin
+    # Small test data for precompilation
+    n = 8
+
+    # Sparse matrix (tridiagonal) - Float64
+    I_sp = Int[]; J_sp = Int[]; V_sp = Float64[]
+    for i in 1:n
+        push!(I_sp, i); push!(J_sp, i); push!(V_sp, 4.0)
+        if i > 1
+            push!(I_sp, i); push!(J_sp, i-1); push!(V_sp, -1.0)
+            push!(I_sp, i-1); push!(J_sp, i); push!(V_sp, -1.0)
+        end
+    end
+    A_sparse_f64 = sparse(I_sp, J_sp, V_sp, n, n)
+
+    # Sparse matrix - ComplexF64
+    A_sparse_c64 = sparse(I_sp, J_sp, ComplexF64.(V_sp), n, n)
+
+    # Dense matrix - Float64
+    A_dense_f64 = Float64[i == j ? 4.0 : (abs(i-j) == 1 ? -1.0 : 0.0) for i in 1:n, j in 1:n]
+
+    # Dense matrix - ComplexF64
+    A_dense_c64 = ComplexF64.(A_dense_f64)
+
+    # Vectors
+    v_f64 = ones(Float64, n)
+    v_c64 = ones(ComplexF64, n)
+
+    # Identity for SPD construction
+    I_sparse = sparse(1.0 * LinearAlgebra.I, n, n)
+
+    # === VectorMPI operations (Float64) ===
+    v = VectorMPI(v_f64)
+    w = VectorMPI(2.0 .* v_f64)
+    _ = v + w
+    _ = v - w
+    _ = 2.0 * v
+    _ = v * 2.0
+    _ = norm(v)
+    _ = dot(v, w)
+    _ = conj(v)
+    _ = length(v)
+    _ = size(v)
+
+    # VectorMPI (ComplexF64)
+    vc = VectorMPI(v_c64)
+    _ = conj(vc)
+    _ = norm(vc)
+
+    # === SparseMatrixMPI operations (Float64) ===
+    A = SparseMatrixMPI{Float64}(A_sparse_f64)
+    B = SparseMatrixMPI{Float64}(A_sparse_f64)
+    _ = A + B
+    _ = A - B
+    _ = 2.0 * A
+    _ = A * v
+    _ = A * B
+    _ = transpose(A)
+    At = SparseMatrixMPI(transpose(A))
+    _ = size(A)
+    _ = nnz(A)
+    _ = norm(A)
+
+    # SparseMatrixMPI (ComplexF64)
+    Ac = SparseMatrixMPI{ComplexF64}(A_sparse_c64)
+    _ = Ac * vc
+
+    # === MatrixMPI operations (Float64) ===
+    D = MatrixMPI(A_dense_f64)
+    _ = 2.0 * D
+    _ = D * v
+    _ = transpose(D)
+    Dt = copy(transpose(D))  # Materialize dense transpose
+    _ = size(D)
+    _ = norm(D)
+
+    # MatrixMPI (ComplexF64)
+    Dc = MatrixMPI(A_dense_c64)
+    _ = Dc * vc
+
+    # === Mixed operations ===
+    _ = A * D  # Sparse * Dense
+
+    # === Indexing ===
+    _ = v[1]
+    _ = A[1, 1]
+    _ = D[1, 1]
+
+    # === Factorization (MUMPS) ===
+    # Make symmetric positive definite: A + A^T + 10I
+    At_mat = SparseMatrixMPI(transpose(A))
+    I_dist = SparseMatrixMPI{Float64}(I_sparse)
+    A_spd = A + At_mat + I_dist * 10.0
+    F = LinearAlgebra.ldlt(A_spd)
+    x = F \ v
+    finalize!(F)
+
+    # LU factorization
+    F_lu = LinearAlgebra.lu(A)
+    x = F_lu \ v
+    finalize!(F_lu)
+
+    # === Block operations ===
+    _ = cat(v, w; dims=1)
+    _ = blockdiag(A, B)
+
+    # === Conversions ===
+    _ = Vector(v)
+    _ = Matrix(D)
+    _ = SparseMatrixCSC(A)
+
+    # Clear caches
+    clear_plan_cache!()
+end
+
+# Generate precompile directives
+# parcel returns (total_time, [module => (module_time, method_instances), ...])
+_, pc = SnoopCompile.parcel(tinf)
+
+# Filter for LinearAlgebraMPI only
+pc_filtered = filter(p -> p.first == LinearAlgebraMPI, pc)
+
+# Extract method instances for our module
+if !isempty(pc_filtered)
+    _, method_instances = pc_filtered[1].second
+
+    # Write the precompile file
+    outfile = joinpath(@__DIR__, "..", "src", "precompile.jl")
+    open(outfile, "w") do io
+        println(io, "# Precompile directives generated by SnoopCompile")
+        println(io, "# Regenerate with: mpiexec -n 1 julia --project=. scripts/generate_precompile.jl")
+        println(io, "#")
+        println(io, "# $(length(method_instances)) method instances")
+        println(io)
+        println(io, "function _precompile_()")
+        println(io, "    ccall(:jl_generating_output, Cint, ()) == 1 || return nothing")
+        SnoopCompile.write(io, method_instances)
+        println(io, "end")
+        println(io)
+        println(io, "_precompile_()")
+    end
+
+    println("Generated precompile directives: $outfile")
+    println("Found $(length(method_instances)) method instances")
+else
+    println("No method instances found for LinearAlgebraMPI")
+end
diff --git a/src/LinearAlgebraMPI.jl b/src/LinearAlgebraMPI.jl
@@ -336,7 +336,7 @@ function LinearAlgebra.issymmetric(A::SparseMatrixMPI{T}) where T
         return false
     end
 
-    At = materialize_transpose(A)
+    At = SparseMatrixMPI(transpose(A))
     return _compare_rows_distributed(A, At)
 end
 
@@ -363,7 +363,7 @@ end
 Solve transpose(A)*x = b using LDLT if transpose(A) is symmetric, otherwise LU.
 """
 function Base.:\(At::Transpose{T,SparseMatrixMPI{T}}, b::VectorMPI{T}) where T
-    A_t = materialize_transpose(At.parent)
+    A_t = SparseMatrixMPI(At)
     F = issymmetric(A_t) ? LinearAlgebra.ldlt(A_t) : LinearAlgebra.lu(A_t)
     x = F \ b
     finalize!(F)
@@ -662,133 +662,8 @@ end
 # Precompilation Workload
 # ============================================================================
 
-using PrecompileTools
-
-@setup_workload begin
-    # Small test data for precompilation (runs with single MPI rank)
-    n = 8
-
-    # Sparse matrix (tridiagonal) - Float64
-    I_sp = Int[]; J_sp = Int[]; V_sp = Float64[]
-    for i in 1:n
-        push!(I_sp, i); push!(J_sp, i); push!(V_sp, 4.0)
-        if i > 1
-            push!(I_sp, i); push!(J_sp, i-1); push!(V_sp, -1.0)
-            push!(I_sp, i-1); push!(J_sp, i); push!(V_sp, -1.0)
-        end
-    end
-    A_sparse_f64 = sparse(I_sp, J_sp, V_sp, n, n)
-
-    # Sparse matrix - ComplexF64
-    A_sparse_c64 = sparse(I_sp, J_sp, ComplexF64.(V_sp), n, n)
-
-    # Dense matrix - Float64
-    A_dense_f64 = Float64[i == j ? 4.0 : (abs(i-j) == 1 ? -1.0 : 0.0) for i in 1:n, j in 1:n]
-
-    # Dense matrix - ComplexF64
-    A_dense_c64 = ComplexF64.(A_dense_f64)
-
-    # Vectors
-    v_f64 = ones(Float64, n)
-    v_c64 = ones(ComplexF64, n)
-
-    # Identity for SPD construction
-    I_sparse = sparse(1.0 * LinearAlgebra.I, n, n)
-
-    @compile_workload begin
-        # Try to initialize MPI - may fail during precompilation under mpiexec
-        mpi_ok = try
-            MPI.Init()
-            true
-        catch
-            false
-        end
-
-        if mpi_ok
-        # === VectorMPI operations (Float64) ===
-        v = VectorMPI(v_f64)
-        w = VectorMPI(2.0 .* v_f64)
-        _ = v + w
-        _ = v - w
-        _ = 2.0 * v
-        _ = v * 2.0
-        _ = norm(v)
-        _ = dot(v, w)
-        _ = conj(v)
-        _ = length(v)
-        _ = size(v)
-
-        # VectorMPI (ComplexF64)
-        vc = VectorMPI(v_c64)
-        _ = conj(vc)
-        _ = norm(vc)
-
-        # === SparseMatrixMPI operations (Float64) ===
-        A = SparseMatrixMPI{Float64}(A_sparse_f64)
-        B = SparseMatrixMPI{Float64}(A_sparse_f64)
-        _ = A + B
-        _ = A - B
-        _ = 2.0 * A
-        _ = A * v
-        _ = A * B
-        _ = transpose(A)
-        At = materialize_transpose(A)
-        _ = size(A)
-        _ = nnz(A)
-        _ = norm(A)
-
-        # SparseMatrixMPI (ComplexF64)
-        Ac = SparseMatrixMPI{ComplexF64}(A_sparse_c64)
-        _ = Ac * vc
-
-        # === MatrixMPI operations (Float64) ===
-        D = MatrixMPI(A_dense_f64)
-        _ = 2.0 * D
-        _ = D * v
-        _ = transpose(D)
-        Dt = copy(transpose(D))  # Materialize dense transpose
-        _ = size(D)
-        _ = norm(D)
-
-        # MatrixMPI (ComplexF64)
-        Dc = MatrixMPI(A_dense_c64)
-        _ = Dc * vc
-
-        # === Mixed operations ===
-        _ = A * D  # Sparse * Dense
-
-        # === Indexing ===
-        _ = v[1]
-        _ = A[1, 1]
-        _ = D[1, 1]
-
-        # === Factorization (MUMPS) ===
-        # Make symmetric positive definite: A + A^T + 10I
-        At_mat = materialize_transpose(A)
-        I_dist = SparseMatrixMPI{Float64}(I_sparse)
-        A_spd = A + At_mat + I_dist * 10.0
-        F = LinearAlgebra.ldlt(A_spd)
-        x = F \ v
-        finalize!(F)
-
-        # LU factorization
-        F_lu = LinearAlgebra.lu(A)
-        x = F_lu \ v
-        finalize!(F_lu)
-
-        # === Block operations ===
-        _ = cat(v, w; dims=1)
-        _ = blockdiag(A, B)
-
-        # === Conversions ===
-        _ = Vector(v)
-        _ = Matrix(D)
-        _ = SparseMatrixCSC(A)
-
-        # Clear caches
-        clear_plan_cache!()
-        end  # if mpi_ok
-    end
-end
+# Precompile directives generated by SnoopCompile
+# Regenerate with: mpiexec -n 1 julia --project=. scripts/generate_precompile.jl
+include("precompile.jl")
 
 end # module LinearAlgebraMPI
diff --git a/src/precompile.jl b/src/precompile.jl
diff --git a/src/sparse.jl b/src/sparse.jl