sloisel
diff --git a/‎.github/workflows/CI.yml‎
Lines changed: 8 additions & 6 deletions b/‎.github/workflows/CI.yml‎
Lines changed: 8 additions & 6 deletions
diff --git a/‎tools/benchmark_cpu_vs_gpu.jl‎
Lines changed: 107 additions & 0 deletions b/‎tools/benchmark_cpu_vs_gpu.jl‎
Lines changed: 107 additions & 0 deletions
diff --git a/‎tools/debug_iteration.jl‎
Lines changed: 82 additions & 0 deletions b/‎tools/debug_iteration.jl‎
Lines changed: 82 additions & 0 deletions
diff --git a/‎tools/debug_map.jl‎
Lines changed: 36 additions & 0 deletions b/‎tools/debug_map.jl‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎tools/julia_profile.jl‎
Lines changed: 37 additions & 0 deletions b/‎tools/julia_profile.jl‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎tools/julia_profile2.jl‎
Lines changed: 37 additions & 0 deletions b/‎tools/julia_profile2.jl‎
Lines changed: 37 additions & 0 deletions
@@ -49,9 +49,10 @@ jobs:
           julia --project=. -e '
             using Pkg
             Pkg.add(name="OpenSSL_jll", version="3.0")
-            # Install from GitHub (with MPI fixes)
-            Pkg.add(url="https://github.com/sloisel/LinearAlgebraMPI.jl.git")
-            Pkg.add(url="https://github.com/sloisel/MultiGridBarrier.jl.git")
+            # Install from GitHub main branch (with MPI fixes)
+            # Use rev="main" to get latest commit, not just tagged version
+            Pkg.add(url="https://github.com/sloisel/LinearAlgebraMPI.jl.git", rev="main")
+            Pkg.add(url="https://github.com/sloisel/MultiGridBarrier.jl.git", rev="main")
             Pkg.instantiate()
             Pkg.build("PyCall")
           '
@@ -111,9 +112,10 @@ jobs:
             using Pkg
             Pkg.develop(PackageSpec(path=pwd()))
             Pkg.add(name="OpenSSL_jll", version="3.0")
-            # Install from GitHub (with MPI fixes)
-            Pkg.add(url="https://github.com/sloisel/LinearAlgebraMPI.jl.git")
-            Pkg.add(url="https://github.com/sloisel/MultiGridBarrier.jl.git")
+            # Install from GitHub main branch (with MPI fixes)
+            # Use rev="main" to get latest commit, not just tagged version
+            Pkg.add(url="https://github.com/sloisel/LinearAlgebraMPI.jl.git", rev="main")
+            Pkg.add(url="https://github.com/sloisel/MultiGridBarrier.jl.git", rev="main")
             Pkg.instantiate()
             Pkg.build("PyCall")
           '
 
@@ -0,0 +1,107 @@
+#!/usr/bin/env julia
+#
+# Benchmark: CPU vs Auto (GPU with size threshold) for fem2d_mpi_solve
+#
+# Run with:
+#   mpiexec -n 1 julia --project=MultiGridBarrierMPI.jl MultiGridBarrierMPI.jl/tools/benchmark_cpu_vs_gpu.jl
+#
+# Note: Metal only supports Float32, so we use Float32 for both CPU and GPU
+#       to ensure a fair comparison.
+#
+# Two modes:
+#   - CPU: Pure CPU (no backend parameter)
+#   - Auto: Automatic GPU/CPU selection based on GPU_MIN_SIZE threshold
+#
+# Every rank runs the solves, but only rank 0 prints, so launching with more
+# than one rank does not repeat each report line once per rank.
+
+using MPI
+MPI.Init()
+
+comm = MPI.COMM_WORLD
+rank = MPI.Comm_rank(comm)
+
+# `println` that is silent on every rank except 0.
+function root_println(args...)
+    rank == 0 && println(args...)
+    return nothing
+end
+
+root_println("Loading packages...")
+using Metal
+using MultiGridBarrierMPI
+using MultiGridBarrier
+using LinearAlgebraMPI
+using LinearAlgebraMPI: GPU_MIN_SIZE
+using BenchmarkTools
+using Printf
+
+# Configurable threshold for "Auto" mode
+const AUTO_THRESHOLD = 1000
+
+# One row of the results table: the L value, the grid size, both timings in
+# seconds, and whether the Auto geometry ended up with non-`Matrix` storage.
+const BenchmarkRow = @NamedTuple{L::Int, n::Int, cpu::Float64, auto::Float64, auto_gpu::Bool}
+
+# Return `(factor, winner)`: how many times faster the quicker mode was, and
+# its label ("Auto" when CPU time / Auto time > 1, otherwise "CPU").
+function faster_mode(cpu_time, auto_time)
+    speedup = cpu_time / auto_time
+    return speedup > 1 ? (speedup, "Auto") : (1 / speedup, "CPU")
+end
+
+root_println("\n" * "="^70)
+root_println("Benchmark: fem2d_mpi_solve - CPU vs Auto")
+root_println("  MPI ranks: $(MPI.Comm_size(comm))")
+root_println("  Element type: Float32 (Metal requirement)")
+root_println("  Auto threshold: GPU_MIN_SIZE = $AUTO_THRESHOLD")
+root_println("  Running L = 1:6")
+root_println("="^70)
+
+# Store results (concretely typed so the summary loop is not working on
+# abstractly-typed rows)
+results = BenchmarkRow[]
+
+for L in 1:6
+    # Get grid size
+    g = fem2d(Float32; L=L)
+    n = size(g.x, 1)
+
+    root_println("\n--- L = $L (n = $n) ---")
+
+    # Benchmark CPU (pure CPU, no backend)
+    root_println("  Benchmarking CPU...")
+    LinearAlgebraMPI.clear_plan_cache!()
+    b_cpu = @benchmark fem2d_mpi_solve(Float32; L=$L, verbose=false) samples=1 evals=1
+    cpu_time = median(b_cpu.times) / 1e9
+
+    # Benchmark Auto (GPU_MIN_SIZE threshold)
+    root_println("  Benchmarking Auto (threshold=$AUTO_THRESHOLD)...")
+    LinearAlgebraMPI.clear_plan_cache!()
+    GPU_MIN_SIZE[] = AUTO_THRESHOLD
+    b_auto = @benchmark fem2d_mpi_solve(Float32; L=$L, backend=LinearAlgebraMPI.mtl, verbose=false) samples=1 evals=1
+    auto_time = median(b_auto.times) / 1e9
+
+    # Determine which arrays went to GPU in auto mode.
+    # The threshold is re-applied before building the probe geometry;
+    # presumably a no-op after the assignment above, kept from the original.
+    GPU_MIN_SIZE[] = AUTO_THRESHOLD
+    g_test = fem2d_mpi(Float32; L=L, backend=LinearAlgebraMPI.mtl)
+    auto_is_gpu = !(g_test.x.A isa Matrix)
+
+    push!(results, (L=L, n=n, cpu=cpu_time, auto=auto_time, auto_gpu=auto_is_gpu))
+
+    # Print results
+    factor, winner = faster_mode(cpu_time, auto_time)
+    root_println("  CPU:  $(round(cpu_time, digits=3))s")
+    root_println("  Auto: $(round(auto_time, digits=3))s [$(auto_is_gpu ? "GPU" : "CPU")]")
+    root_println("  Speedup: $(round(factor, digits=2))x ($winner faster)")
+end
+
+# Summary table (rank 0 only; the timings reported are the ones measured there)
+if rank == 0
+    println("\n" * "="^70)
+    println("Summary")
+    println("="^70)
+    println("\n  L       n         CPU        Auto    Speedup   Auto backend")
+    println("  -       -         ---        ----    -------   ------------")
+    for r in results
+        n_str = lpad(r.n, 7)
+        cpu_str = @sprintf("%6.3fs", r.cpu)
+        auto_str = @sprintf("%6.3fs", r.auto)
+
+        factor, winner = faster_mode(r.cpu, r.auto)
+        speedup_str = lpad(@sprintf("%.2fx %s", factor, winner), 10)
+
+        auto_backend = r.auto_gpu ? "GPU" : "CPU"
+        println("  $(r.L)    $n_str    $cpu_str    $auto_str    $speedup_str       $auto_backend")
+    end
+
+    println("\n  Auto threshold: GPU_MIN_SIZE = $AUTO_THRESHOLD")
+    println("  Speedup = CPU time / Auto time (>1 means Auto is faster)")
+    println("="^70)
+end
@@ -0,0 +1,82 @@
+#!/usr/bin/env julia
+# Debug aid: times several equivalent ways of applying a per-row function to
+# the local rows of the fem2d_mpi geometry fields `x` and `w`, then times
+# VectorMPI_local. Each timing is a single shot, so it may include compilation.
+using MPI
+MPI.Init()
+
+using MultiGridBarrierMPI
+using LinearAlgebraMPI
+using LinearAlgebraMPI: VectorMPI, MatrixMPI, _local_rows, VectorMPI_local
+
+MultiGridBarrierMPI.Init()
+
+geometry = fem2d_mpi(Float64; L=6)
+x_mpi = geometry.x
+w_mpi = geometry.w
+
+println("x type: ", typeof(x_mpi))
+println("w type: ", typeof(w_mpi))
+
+# One row iterator per operand, in the order the kernel takes its arguments.
+row_sources = (_local_rows(x_mpi), _local_rows(w_mpi))
+println("\nrow_iters types:")
+println("  _local_rows(x): ", typeof(row_sources[1]))
+println("  _local_rows(w): ", typeof(row_sources[2]))
+
+# Show the element types of at most the first three zipped items.
+println("\nFirst few items from zip(row_iters...):")
+for (idx, items) in enumerate(Iterators.take(zip(row_sources...), 3))
+    println("  $idx: types = $(typeof.(items))")
+end
+
+# Kernel under test: the sum of a row scaled by the matching entry of w.
+row_kernel = (row, weight) -> weight * sum(row)
+
+println("\nTiming comprehension vs loop:")
+
+# Variant 1: comprehension over the zipped row iterators
+start_ns = time_ns()
+from_comprehension = [row_kernel(rows...) for rows in zip(row_sources...)]
+comprehension_us = (time_ns() - start_ns) / 1000
+println("Comprehension: $(round(comprehension_us, digits=1)) μs, length=$(length(from_comprehension))")
+
+# Variant 2: explicit loop pushing into a growing vector (fresh iterators,
+# built outside the timed region)
+loop_sources = (_local_rows(x_mpi), _local_rows(w_mpi))
+start_ns = time_ns()
+from_loop = Float64[]
+for rows in zip(loop_sources...)
+    push!(from_loop, row_kernel(rows...))
+end
+loop_us = (time_ns() - start_ns) / 1000
+println("Simple loop: $(round(loop_us, digits=1)) μs, length=$(length(from_loop))")
+
+# Variant 3: index straight into the underlying local storage
+local_x = x_mpi.A
+local_w = w_mpi.v
+start_ns = time_ns()
+from_direct = Vector{Float64}(undef, size(local_x, 1))
+for i in 1:size(local_x, 1)
+    from_direct[i] = row_kernel(view(local_x, i, :), local_w[i])
+end
+direct_us = (time_ns() - start_ns) / 1000
+println("Direct loop: $(round(direct_us, digits=1)) μs, length=$(length(from_direct))")
+
+# Variant 4: map over eachrow of the local matrix and the local vector
+start_ns = time_ns()
+from_eachrow = collect(map(row_kernel, eachrow(local_x), local_w))
+eachrow_us = (time_ns() - start_ns) / 1000
+println("map(f, eachrow, v): $(round(eachrow_us, digits=1)) μs, length=$(length(from_eachrow))")
+
+# Variant 5: map over the _local_rows iterators (again built before timing)
+map_sources = (_local_rows(x_mpi), _local_rows(w_mpi))
+start_ns = time_ns()
+from_map = collect(map(row_kernel, map_sources...))
+map_us = (time_ns() - start_ns) / 1000
+println("map(f, _local_rows...): $(round(map_us, digits=1)) μs, length=$(length(from_map))")
+
+# Finally, time wrapping the direct-loop result with VectorMPI_local
+println("\nTiming VectorMPI_local:")
+start_ns = time_ns()
+wrapped = VectorMPI_local(from_direct)
+wrap_us = (time_ns() - start_ns) / 1000
+println("VectorMPI_local: $(round(wrap_us, digits=1)) μs")
+
+println("\nDone.")
@@ -0,0 +1,36 @@
+#!/usr/bin/env julia
+# Debug aid: checks that `map` over the `_local_rows` iterators of the
+# fem2d_mpi geometry fields works, and prints the type and length of what
+# comes back, next to the same `map` done on the underlying `A` / `v` fields.
+using MPI
+MPI.Init()
+
+using MultiGridBarrierMPI
+using LinearAlgebraMPI
+using LinearAlgebraMPI: VectorMPI, MatrixMPI, _local_rows
+
+MultiGridBarrierMPI.Init()
+
+geometry = fem2d_mpi(Float64; L=4)
+x_mpi = geometry.x
+w_mpi = geometry.w
+
+println("x type: ", typeof(x_mpi))
+println("w type: ", typeof(w_mpi))
+
+# One row iterator per operand, in the order the kernel takes its arguments.
+row_sources = (_local_rows(x_mpi), _local_rows(w_mpi))
+println("\nrow_iters types:")
+println("  _local_rows(x): ", typeof(row_sources[1]))
+println("  _local_rows(w): ", typeof(row_sources[2]))
+
+# Kernel under test: the sum of a row scaled by the matching entry of w.
+row_kernel = (row, weight) -> weight * sum(row)
+
+println("\nTrying map(f, row_iters...):")
+via_iterators = collect(map(row_kernel, row_sources...))
+preview_count = min(5, length(via_iterators))
+println("Result type: ", typeof(via_iterators))
+println("Result length: ", length(via_iterators))
+println("First 5 elements: ", via_iterators[1:preview_count])
+
+println("\nTrying with eachrow and vector directly:")
+via_storage = collect(map(row_kernel, eachrow(x_mpi.A), w_mpi.v))
+println("Result2 type: ", typeof(via_storage))
+println("Result2 length: ", length(via_storage))
+
+println("\nDone.")
@@ -0,0 +1,37 @@
+#!/usr/bin/env julia
+# Profiles one `MultiGridBarrier.amgb` solve on the fem2d_mpi geometry with
+# Julia's built-in sampling profiler and prints the flat report.
+using MPI
+MPI.Init()
+
+using Profile
+using MultiGridBarrier
+using MultiGridBarrierMPI
+using LinearAlgebraMPI
+
+MultiGridBarrierMPI.Init()
+
+const PROFILE_LEVEL = 6
+
+banner = "="^70
+println(banner)
+println("Julia profiling at L=$(PROFILE_LEVEL)")
+println(banner)
+
+# Build the geometry that both the warmup and the profiled solve operate on
+geometry = fem2d_mpi(Float64; L=PROFILE_LEVEL)
+# NOTE(review): the count is derived from the sum of `row_partition` minus 2;
+# confirm this formula still holds when running on more than one rank.
+println("Grid points: ", sum(geometry.x.row_partition) - 2)
+
+# Run the solve once unprofiled so compilation is not sampled
+println("Warmup...")
+MultiGridBarrier.amgb(geometry; verbose=false, tol=0.1)
+
+# Discard old samples, then record the second solve
+println("Profiling MPI solve...")
+Profile.clear()
+@profile MultiGridBarrier.amgb(geometry; verbose=false, tol=0.1)
+
+# Flat report, sorted by sample count, hiding entries below 100 samples
+println("\nTop 30 by flat count:")
+Profile.print(format=:flat, sortedby=:count, mincount=100, maxdepth=30)
+
+println("\n" * banner)
+println("Done.")
@@ -0,0 +1,37 @@
+#!/usr/bin/env julia
+# Profiles one `MultiGridBarrier.amgb` solve on the fem2d_mpi geometry with
+# Julia's built-in sampling profiler and prints the call-tree report.
+using MPI
+MPI.Init()
+
+using Profile
+using MultiGridBarrier
+using MultiGridBarrierMPI
+using LinearAlgebraMPI
+
+MultiGridBarrierMPI.Init()
+
+const PROFILE_LEVEL = 6
+
+banner = "="^70
+println(banner)
+println("Julia profiling at L=$(PROFILE_LEVEL) (tree view)")
+println(banner)
+
+# Build the geometry that both the warmup and the profiled solve operate on
+geometry = fem2d_mpi(Float64; L=PROFILE_LEVEL)
+# NOTE(review): the count is derived from the sum of `row_partition` minus 2;
+# confirm this formula still holds when running on more than one rank.
+println("Grid points: ", sum(geometry.x.row_partition) - 2)
+
+# Run the solve once unprofiled so compilation is not sampled
+println("Warmup...")
+MultiGridBarrier.amgb(geometry; verbose=false, tol=0.1)
+
+# Discard old samples, then record the second solve
+println("Profiling MPI solve...")
+Profile.clear()
+@profile MultiGridBarrier.amgb(geometry; verbose=false, tol=0.1)
+
+# Tree report, hiding branches below 200 samples and deeper than 25 frames
+println("\nTree view (mincount=200):")
+Profile.print(format=:tree, mincount=200, maxdepth=25)
+
+println("\n" * banner)
+println("Done.")