diff --git a/.github/workflows/Tests.yml b/.github/workflows/Tests.yml index 91ffa9376..1fe10f043 100644 --- a/.github/workflows/Tests.yml +++ b/.github/workflows/Tests.yml @@ -34,6 +34,7 @@ jobs: - "Core" - "DefaultsLoading" - "LinearSolveHYPRE" + - "LinearSolvePETSc" - "LinearSolvePardiso" - "LinearSolveGinkgo" - "NoPre" diff --git a/Project.toml b/Project.toml index a31972a96..3523eff40 100644 --- a/Project.toml +++ b/Project.toml @@ -126,7 +126,7 @@ Metal = "1.4" Mooncake = "0.4, 0.5" MultiFloats = "2.3, 3" OpenBLAS_jll = "0.3" -PETSc = "0.4.2" +PETSc = "0.4.6" ParU_jll = "1" Pardiso = "1" Pkg = "1.10" diff --git a/ext/LinearSolvePETScExt.jl b/ext/LinearSolvePETScExt.jl index a4505fae1..b8257a1f8 100644 --- a/ext/LinearSolvePETScExt.jl +++ b/ext/LinearSolvePETScExt.jl @@ -1,183 +1,516 @@ module LinearSolvePETScExt -using LinearAlgebra using PETSc -using PETSc: MPI -using PETSc: petsclibs -using SparseArrays: SparseMatrixCSC, sparse -using LinearSolve: PETScAlgorithm, LinearCache, LinearProblem, LinearSolve, - OperatorAssumptions, default_tol, init_cacheval, __issquare, - __conditioning, LinearSolveAdjoint, LinearVerbosity -using SciMLLogging: SciMLLogging, verbosity_to_int, @SciMLMessage -using SciMLBase: LinearProblem, LinearAliasSpecifier, SciMLBase -using Setfield: @set! - -mutable struct PETScCache - ksp::Any # PETSc KSP object or nothing +import PETSc: MPI, LibPETSc +using SparseArrays: SparseMatrixCSC, nzrange, sparse +using LinearSolve: PETScAlgorithm, LinearCache, LinearSolve, + OperatorAssumptions, LinearVerbosity +using SciMLBase: LinearSolution, build_linear_solution, ReturnCode, SciMLBase + +# ── MPI communicator ────────────────────────────────────────────────────────── + +# Only MPI.COMM_SELF (serial) is supported. MPI-parallel solves are planned +# for a future release. +function resolve_comm(alg::PETScAlgorithm) + comm = alg.comm === nothing ? MPI.COMM_SELF : alg.comm + comm == MPI.COMM_SELF || error( + "PETScAlgorithm currently only supports MPI.COMM_SELF (serial solves). " * + "Pass `comm = nothing` or `comm = MPI.COMM_SELF` to use serial mode." + ) + return comm +end + +# Select the PETSc C library that matches the scalar type of the system matrix. +get_petsclib(::Type{T} = Float64) where {T} = PETSc.getlib(; PetscScalar = T) + +# ── Sparsity pattern check ──────────────────────────────────────────────────── + +function sparsity_pattern_changed(old_colptr, old_rowval, new_A::SparseMatrixCSC) + # 1. Quick length checks + if length(old_colptr) != length(new_A.colptr) || length(old_rowval) != length(new_A.rowval) + return true + end + + # 2. Vectorized comparison (very fast in Julia) + # This checks both sizes and element-wise equality + return old_colptr != new_A.colptr || old_rowval != new_A.rowval +end + +# ── Cache ───────────────────────────────────────────────────────────────────── + +""" + PETScCache{T} + +Owns all PETSc C-side objects for one LinearSolve cache instance. +`T` is the PETSc scalar type (must match `eltype` of the system matrix). + +Fields +────── +- `ksp` — KSP (Krylov subspace) solver object. +- `petsclib` — PETSc library handle; selects the scalar-type-specific C library. +- `comm` — MPI communicator (always `COMM_SELF` in this implementation). +- `petsc_A` — System matrix in PETSc format. +- `petsc_P` — Preconditioner matrix. Aliases `petsc_A` when no separate + `prec_matrix` is provided (same pointer, not a copy). +- `nullspace_obj` — `MatNullSpace` object, or `nothing` if unused. +- `petsc_x` — Solution vector. 
+- `petsc_b` — Right-hand side vector. +- `vec_n` — Length of the currently allocated vectors; used to detect resizing. +- `prev_colptr` — Previous matrix's colptr array (for sparse matrices). +- `prev_rowval` — Previous matrix's rowval array (for sparse matrices). +- `prev_size` — Previous matrix's size (for dense matrices). +- `sparse_perm` — Permutation vector of length `nnz` mapping PETSc's internal + CSR index to Julia's CSC `nzval` index. Allocated once per + KSP build and reused on every Case 3 update with zero allocations. +- `sparse_scratch` — Scratch buffer of length `2*(m+1)` used by `_csc_to_csr_perm!` + to avoid allocation during permutation construction. +- `initialized` — `true` once `PETSc.initialize` has been called for this cache. +""" +mutable struct PETScCache{T} + ksp::Any petsclib::Any - A::Any # PETSc Mat or nothing - b::Any # PETSc Vec or nothing - u::Any # PETSc Vec or nothing + comm::Union{MPI.Comm, Nothing} + petsc_A::Any + petsc_P::Any + nullspace_obj::Any + petsc_x::Any + petsc_b::Any + vec_n::Int + prev_colptr::Union{Vector{Int}, Nothing} + prev_rowval::Union{Vector{Int}, Nothing} + prev_size::Union{NTuple{2, Int}, Nothing} + sparse_perm::Union{Vector{Int}, Nothing} + sparse_scratch::Union{Vector{Int}, Nothing} initialized::Bool end +# Reset all fields to their zero/nothing state. +# Only called after C-side PETSc objects have already been destroyed. +function _nullify_all!(pcache::PETScCache) + pcache.ksp = pcache.petsc_A = pcache.petsc_P = pcache.nullspace_obj = nothing + pcache.petsc_x = pcache.petsc_b = nothing + pcache.sparse_perm = pcache.sparse_scratch = nothing + pcache.vec_n = 0 + pcache.prev_colptr = pcache.prev_rowval = pcache.prev_size = nothing + return pcache.initialized = false +end + +""" + cleanup_petsc_cache!(pcache::PETScCache) + cleanup_petsc_cache!(cache::LinearCache) + cleanup_petsc_cache!(sol::LinearSolution) + +Destroy all PETSc objects owned by `pcache` and reset its state to empty. +Safe to call multiple times — subsequent calls after the first are no-ops. + +Prefer calling this explicitly for deterministic resource release; the cache +also registers a GC finalizer as a safety net. + +Destruction order matters because PETSc objects hold internal references: + 1. KSP — holds references to `petsc_A` and `petsc_P`. + 2. MatNullSpace — attached to `petsc_A`. + 3. Vectors — independent of matrices; order between them does not matter. + 4. Preconditioner matrix — only if distinct from `petsc_A` (avoid double-free). + 5. System matrix — destroyed last, once nothing else references it. +""" +function cleanup_petsc_cache!(pcache::PETScCache) + if pcache.petsclib === nothing || + (pcache.initialized && !PETSc.initialized(pcache.petsclib)) + _nullify_all!(pcache) + return + end + try + pcache.ksp !== nothing && PETSc.destroy(pcache.ksp) + pcache.nullspace_obj !== nothing && + LibPETSc.MatNullSpaceDestroy(pcache.petsclib, pcache.nullspace_obj) + pcache.petsc_x !== nothing && PETSc.destroy(pcache.petsc_x) + pcache.petsc_b !== nothing && PETSc.destroy(pcache.petsc_b) + pcache.petsc_P !== nothing && pcache.petsc_P !== pcache.petsc_A && + PETSc.destroy(pcache.petsc_P) + pcache.petsc_A !== nothing && PETSc.destroy(pcache.petsc_A) + catch + # Swallow errors — cleanup may be called from a GC finalizer where + # throwing is unsafe. 
+    end
+    return _nullify_all!(pcache)
+end
+
+cleanup_petsc_cache!(cache::LinearCache) = cleanup_petsc_cache!(cache.cacheval)
+cleanup_petsc_cache!(sol::LinearSolution) = cleanup_petsc_cache!(sol.cache.cacheval)
+
+# ── Cache initialisation ──────────────────────────────────────────────────────
+
+# Called by LinearSolve.init to allocate the solver-specific cache.
+# An empty shell is returned here; actual PETSc objects are created lazily on
+# the first solve! call so PETSc is never initialised unless actually used.
 function LinearSolve.init_cacheval(
         alg::PETScAlgorithm, A, b, u, Pl, Pr, maxiters::Int,
-        abstol, reltol,
-        verbose::Union{LinearVerbosity, Bool}, assumptions::OperatorAssumptions
+        abstol, reltol, verbose::Union{LinearVerbosity, Bool},
+        assumptions::OperatorAssumptions
 )
-    return PETScCache(nothing, nothing, nothing, nothing, nothing, false)
+    T = eltype(A)
+    pcache = PETScCache{T}(
+        nothing, nothing, nothing, nothing, nothing, nothing,
+        nothing, nothing, 0, nothing, nothing, nothing, nothing, nothing, false
+    )
+    finalizer(cleanup_petsc_cache!, pcache)
+    return pcache
+end
+
+# ── Matrix conversion ─────────────────────────────────────────────────────────
+
+# Convert a Julia matrix to a PETSc sequential matrix.
+#
+# Sparse (SparseMatrixCSC) → MatSeqAIJWithArrays:
+#     Internally converts to CSR format, so construction from CSC is
+#     comparatively expensive.
+#
+# Dense (AbstractMatrix) → MatSeqDense:
+#     PETSc allocates its own buffer and copies the values in on construction.
+#     This protects the Julia array even when a factorising preconditioner is
+#     used without a separate `prec_matrix`.
+function to_petsc_mat(petsclib, A)
+    if A isa SparseMatrixCSC
+        return PETSc.MatSeqAIJWithArrays(petsclib, MPI.COMM_SELF, A)
+    else
+        return PETSc.MatSeqDense(petsclib, A)
+    end
+end
+
+# ── CSC → CSR permutation ─────────────────────────────────────────────────────
+#
+# PETSc stores sparse matrix values in CSR (row-major) order internally, while
+# Julia's SparseMatrixCSC uses CSC (column-major) order. To bulk-copy nzval
+# into PETSc's internal buffer via MatSeqAIJGetArray, we need a permutation
+# perm such that:
+#
+#     petsc_vals[csr_idx] = A.nzval[perm[csr_idx]]
+#
+# scratch must have length 2*(m+1) and is used as two consecutive views:
+#     scratch[1 : m+1]       → row_ptr  (CSR row start positions, 1-based)
+#     scratch[m+2 : 2*(m+1)] → fill_pos (current write cursor per row)
+#
+# Walking the CSC columns in ascending order guarantees that within each row,
+# columns are visited in ascending order — matching PETSc's sorted-column
+# requirement for CSR storage.
+#
+# This mimics MatSeqAIJWithArrays without rebuilding the matrix itself and is
+# currently the limiting factor for large systems.
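+#
+# Worked illustration (not part of the implementation): for the 2×2 matrix
+# A = [1 2; 3 4], CSC stores nzval column-major as [1, 3, 2, 4], while the
+# CSR layout expects row-major order [1, 2, 3, 4]. The permutation is
+# therefore perm = [1, 3, 2, 4], so that A.nzval[perm] walks the values in
+# CSR order:
+#
+#     A = sparse([1.0 2.0; 3.0 4.0])   # A.nzval == [1.0, 3.0, 2.0, 4.0]
+#     perm = [1, 3, 2, 4]
+#     A.nzval[perm]                     # == [1.0, 2.0, 3.0, 4.0]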
+function _csc_to_csr_perm!(perm::Vector{Int}, scratch::Vector{Int}, A::SparseMatrixCSC) + m, n = size(A) + row_ptr = view(scratch, 1:(m + 1)) + fill_pos = view(scratch, (m + 2):(2 * (m + 1))) + + fill!(row_ptr, 0) + # Pass 1: Count + @inbounds for i in 1:length(A.rowval) + row_ptr[A.rowval[i] + 1] += 1 + end + + # Pass 2: Cumulative sum (The "Compact" way) + count = 1 + @inbounds for r in 1:m + tmp = row_ptr[r + 1] + row_ptr[r] = count + count += tmp + end + row_ptr[m + 1] = count + + copyto!(fill_pos, row_ptr) + + # Pass 3: Map + @inbounds for col in 1:n + for csc_idx in nzrange(A, col) + r = A.rowval[csc_idx] + perm[fill_pos[r]] = csc_idx + fill_pos[r] += 1 + end + end + return perm end -# Helper function to get or initialize the PETSc library -function get_petsclib(T::Type = Float64) - # Find a petsclib that matches our scalar type - for lib in PETSc.petsclibs - if PETSc.scalartype(lib) === T - return lib +# ── In-place matrix value updates (Case 3) ──────────────────────────────────── + +# Sparse: bulk-copy nzval into PETSc's internal CSR buffer in a single C call. +# The pre-computed permutation maps each CSR index to the corresponding CSC +# nzval index, so the inner loop is a plain indexed array read — no FFI +# overhead per element, no allocation. +# assemble! bumps PETSc's modification counter, triggering preconditioner +# recomputation on the next KSPSolve without an explicit KSPSetOperators call. +function update_mat_values!(petsclib, PA, A::SparseMatrixCSC, perm::Vector{Int}; assemble::Bool = true) + vals = LibPETSc.MatSeqAIJGetArray(petsclib, PA) + nzval = A.nzval + try + @inbounds for i in eachindex(vals) + vals[i] = nzval[perm[i]] end + finally + LibPETSc.MatSeqAIJRestoreArray(petsclib, PA, vals) + end + assemble && PETSc.assemble!(PA) + return nothing +end + +# Dense: MatDenseGetArray wraps PETSc's column-major buffer as a Julia Array. +# copyto! is equivalent to a single memcpy — O(n²) with negligible constant. +function update_mat_values!(petsclib, PA, A::AbstractMatrix) + ptr = LibPETSc.MatDenseGetArray(petsclib, PA) + try + copyto!(ptr, A) + finally + LibPETSc.MatDenseRestoreArray(petsclib, PA, ptr) + end + return PETSc.assemble!(PA) +end + +# ── Vector I/O ──────────────────────────────────────────────────────────────── +# +# VecGetArray / VecGetArrayRead return Julia Arrays wrapping PETSc's internal +# memory — no extra allocation. The matching Restore call is mandatory and +# must always be reached (hence the try/finally pattern). +# +# Note: VecAssemblyBegin/End is NOT needed after direct pointer writes. +# Assembly is only required after VecSetValues, which uses an off-process +# communication stash. Direct pointer access bypasses the stash entirely. 
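+#
+# Usage sketch (illustration only; mirrors what solve! does further below,
+# and the names ksp / petsc_b / petsc_x are placeholders):
+#
+#     write_vec!(petsc_b, cache.b, petsclib)               # Julia → PETSc
+#     LibPETSc.KSPSolve(petsclib, ksp, petsc_b, petsc_x)   # solve in place
+#     read_vec!(cache.u, petsc_x, petsclib)                # PETSc → Julia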
+ +function write_vec!(pv, src::AbstractVector, petsclib) + ptr = LibPETSc.VecGetArray(petsclib, pv) + return try + copyto!(ptr, src) # Use vectorized copy + finally + LibPETSc.VecRestoreArray(petsclib, pv, ptr) end - # Fallback to first available - return first(PETSc.petsclibs) end -# Convert Julia sparse matrix to PETSc matrix -function to_petsc_mat(petsclib, A::SparseMatrixCSC{T}) where {T} - return PETSc.MatCreateSeqAIJ(petsclib, MPI.COMM_SELF, A) +function read_vec!(dst::AbstractVector, pv, petsclib) + ptr = LibPETSc.VecGetArrayRead(petsclib, pv) + return try + copyto!(dst, ptr) + finally + LibPETSc.VecRestoreArrayRead(petsclib, pv, ptr) + end end -# Convert Julia dense matrix to PETSc matrix -function to_petsc_mat(petsclib, A::AbstractMatrix{T}) where {T} - return PETSc.MatSeqDense(petsclib, Matrix{T}(A)) +# Allocate a PETSc sequential Vec of length n and fill it from src. +function create_seq_vec(petsclib, src::AbstractVector) + pv = LibPETSc.VecCreateSeq(petsclib, MPI.COMM_SELF, petsclib.PetscInt(length(src))) + write_vec!(pv, src, petsclib) + return pv end -# Convert Julia vector to PETSc vector -function to_petsc_vec(petsclib, v::AbstractVector{T}) where {T} - return PETSc.VecSeq(petsclib, MPI.COMM_SELF, Vector{T}(v)) +# Recreate petsc_x and petsc_b only when the problem size changes. +function ensure_vecs!(pcache, petsclib, u, b) + n = length(b) + if pcache.vec_n != n || pcache.petsc_x === nothing || pcache.petsc_b === nothing + pcache.petsc_x !== nothing && PETSc.destroy(pcache.petsc_x) + pcache.petsc_b !== nothing && PETSc.destroy(pcache.petsc_b) + pcache.petsc_x = create_seq_vec(petsclib, u) + pcache.petsc_b = create_seq_vec(petsclib, b) + pcache.vec_n = n + end + return pcache.petsc_x, pcache.petsc_b end -# Symbol to PETSc KSP type string -function ksp_type_string(solver_type::Symbol) - ksp_types = Dict( - :gmres => "gmres", - :cg => "cg", - :bicg => "bicg", - :bcgs => "bcgs", - :bcgsl => "bcgsl", - :cgs => "cgs", - :tfqmr => "tfqmr", - :cr => "cr", - :gcr => "gcr", - :preonly => "preonly", - :richardson => "richardson", - :chebyshev => "chebyshev", - :minres => "minres", - :symmlq => "symmlq", - :lgmres => "lgmres", - :fgmres => "fgmres", +# ── Null space ──────────────────────────────────────────────────────────────── + +# Build a PETSc MatNullSpace from alg.nullspace: +# +# :none — no null-space handling; returns nothing. +# :constant — the constant vector spans the null space (e.g. pressure in +# pure-Neumann incompressible flow). PETSc constructs it internally. +# :custom — caller supplies an explicit orthonormal basis via alg.nullspace_vecs. +# PETSc copies the vectors on creation, so the temporary wrappers +# can be destroyed immediately after MatNullSpaceCreate returns. +function build_nullspace(petsclib, alg::PETScAlgorithm) + alg.nullspace === :none && return nothing + alg.nullspace === :constant && return LibPETSc.MatNullSpaceCreate( + petsclib, MPI.COMM_SELF, LibPETSc.PetscBool(true), 0, LibPETSc.PetscVec[] ) - return get(ksp_types, solver_type, string(solver_type)) -end - -# Symbol to PETSc PC type string -function pc_type_string(pc_type::Symbol) - pc_types = Dict( - :none => "none", - :jacobi => "jacobi", - :sor => "sor", - :lu => "lu", - :ilu => "ilu", - :icc => "icc", - :cholesky => "cholesky", - :gamg => "gamg", - :hypre => "hypre", - :bjacobi => "bjacobi", - :asm => "asm", - :mg => "mg", + # :custom + PScalar = petsclib.PetscScalar + petsc_vecs = LibPETSc.PetscVec[ + create_seq_vec(petsclib, eltype(v) == PScalar ? 
v : PScalar.(v)) + for v in alg.nullspace_vecs + ] + ns = LibPETSc.MatNullSpaceCreate( + petsclib, MPI.COMM_SELF, LibPETSc.PetscBool(false), + length(petsc_vecs), petsc_vecs ) - return get(pc_types, pc_type, string(pc_type)) + foreach(PETSc.destroy, petsc_vecs) + return ns end -function SciMLBase.solve!(cache::LinearCache, alg::PETScAlgorithm, args...; kwargs...) - pcache = cache.cacheval +function attach_nullspace!(petsclib, petsc_A, ns) + ns === nothing && return + return LibPETSc.MatSetNullSpace(petsclib, petsc_A, ns) +end - # Get element type from the problem - T = eltype(cache.A) +# ── Full KSP construction ───────────────────────────────────────────────────── - # Initialize PETSc if needed - if !pcache.initialized - petsclib = get_petsclib(T) - if !PETSc.initialized(petsclib) - PETSc.initialize(petsclib) +# Build PETSc matrices, configure the KSP solver and preconditioner, and +# optionally attach a null space. Called on Case 1 (first solve) and Case 2 +# (structure changed). +# +# For sparse matrices, the CSC→CSR permutation is computed here once and stored +# in pcache for reuse on all subsequent Case 3 value-only updates. The buffers +# are only reallocated when nnz changes (which implies a Case 2 rebuild anyway). +# +# vec_n is reset to 0 so ensure_vecs! unconditionally recreates petsc_x and +# petsc_b after every rebuild — required when the problem size changes. +function build_ksp!(pcache, petsclib, cache, alg) + pcache.vec_n = 0 + pcache.petsc_A = to_petsc_mat(petsclib, cache.A) + + if cache.A isa SparseMatrixCSC + nnz = length(cache.A.nzval) + m = size(cache.A, 1) + # Reallocate only when nnz changes (implies a structural change). + if pcache.sparse_perm === nothing || length(pcache.sparse_perm) != nnz + pcache.sparse_perm = Vector{Int}(undef, nnz) + pcache.sparse_scratch = Vector{Int}(undef, 2 * (m + 1)) end - pcache.petsclib = petsclib - pcache.initialized = true + _csc_to_csr_perm!(pcache.sparse_perm, pcache.sparse_scratch, cache.A) + # Store the current sparsity pattern + pcache.prev_colptr = cache.A.colptr + pcache.prev_rowval = cache.A.rowval + else + pcache.sparse_perm = nothing + pcache.sparse_scratch = nothing + pcache.prev_size = size(cache.A) end - petsclib = pcache.petsclib + # petsc_P aliases petsc_A when no separate preconditioner matrix is given. + pcache.petsc_P = alg.prec_matrix === nothing ? + pcache.petsc_A : to_petsc_mat(petsclib, alg.prec_matrix) - # Convert matrix if needed - if pcache.A === nothing || cache.isfresh - pcache.A = to_petsc_mat(petsclib, cache.A) - end + pcache.ksp = PETSc.KSP( + pcache.petsc_A, pcache.petsc_P; + ksp_type = string(alg.solver_type), + pc_type = string(alg.pc_type), + ksp_rtol = cache.reltol, + ksp_atol = cache.abstol, + ksp_max_it = cache.maxiters, + alg.ksp_options... + ) - # Convert vectors - pcache.b = to_petsc_vec(petsclib, cache.b) - pcache.u = to_petsc_vec(petsclib, copy(cache.u)) + alg.initial_guess_nonzero && + LibPETSc.KSPSetInitialGuessNonzero(petsclib, pcache.ksp, LibPETSc.PetscBool(true)) - # Create KSP solver if needed - if pcache.ksp === nothing - ksp = PETSc.KSP(petsclib, pcache.A; ksp_rtol = cache.reltol, ksp_atol = cache.abstol) + pcache.nullspace_obj = build_nullspace(petsclib, alg) + return attach_nullspace!(petsclib, pcache.petsc_A, pcache.nullspace_obj) +end - # Set KSP type - ksp_type = ksp_type_string(alg.solver_type) - PETSc.settype!(ksp, ksp_type) +# ── solve! 
──────────────────────────────────────────────────────────────────── - # Set PC type - if alg.pc_type !== :none - pc = PETSc.PC(ksp) - pc_type = pc_type_string(alg.pc_type) - PETSc.settype!(pc, pc_type) - end +""" + solve!(cache::LinearCache, alg::PETScAlgorithm; kwargs...) - # Set max iterations - PETSc.settolerances!(ksp; maxits = cache.maxiters) +Solve the linear system stored in `cache` using PETSc. - PETSc.setfromoptions!(ksp) - pcache.ksp = ksp - else - # Update operators if matrix changed - if cache.isfresh - PETSc.setoperators!(pcache.ksp, pcache.A, pcache.A) +Three execution paths are chosen based on `cache.isfresh` and the matrix's +structural fingerprint: + + **Case 1** — first solve (`ksp === nothing`): + Build all PETSc objects from scratch. + + **Case 2** — `isfresh`, structure changed (sparse pattern or dense size): + Destroy existing PETSc objects and rebuild. + + **Case 3** — `isfresh`, same structure, values changed: + Update matrix values in place and reuse the existing KSP. + - Sparse: bulk-copies `nzval` into PETSc's internal CSR buffer using a + pre-computed CSC→CSR permutation. Zero allocations on the hot path; + one C call in, one O(nnz) indexed loop, one C call out. + - Dense: `copyto!` into PETSc's column-major buffer — equivalent to a + single `memcpy`. + + **No change** (`isfresh = false`): + Only the RHS is updated; all PETSc objects are reused as-is. +""" +function SciMLBase.solve!(cache::LinearCache, alg::PETScAlgorithm; kwargs...) + + pcache = cache.cacheval + comm = resolve_comm(alg) + + petsclib = pcache.petsclib === nothing ? get_petsclib(eltype(cache.A)) : pcache.petsclib + PETSc.initialized(petsclib) || PETSc.initialize(petsclib) + pcache.petsclib = petsclib + pcache.comm = comm + pcache.initialized = true + + # ── Decide: rebuild, update in-place, or reuse ──────────────────────────── + rebuild_ksp = pcache.ksp === nothing # Case 1 + if !rebuild_ksp && cache.isfresh + if cache.A isa SparseMatrixCSC + if pcache.prev_colptr === nothing || pcache.prev_rowval === nothing + rebuild_ksp = true + else + # Compare stored pointers/arrays directly against the new matrix + rebuild_ksp = sparsity_pattern_changed( + pcache.prev_colptr, + pcache.prev_rowval, + cache.A + ) + end + else + # For dense matrices, check if the size has changed + rebuild_ksp = pcache.prev_size !== nothing && size(cache.A) != pcache.prev_size end end - cache.isfresh = false - - # Solve - PETSc.solve!(pcache.u, pcache.ksp, pcache.b) + if rebuild_ksp + build_ksp!(pcache, petsclib, cache, alg) + elseif cache.isfresh + # Case 3: same structure — update values without touching the KSP object. + if cache.A isa SparseMatrixCSC + update_mat_values!( + petsclib, pcache.petsc_A, cache.A, pcache.sparse_perm; + assemble = cache.precsisfresh + ) + else + update_mat_values!(petsclib, pcache.petsc_A, cache.A) + end + if alg.prec_matrix !== nothing + if alg.prec_matrix isa SparseMatrixCSC + # Preconditioner shares the pattern with A — reuse the permutation. + update_mat_values!(petsclib, pcache.petsc_P, alg.prec_matrix, pcache.sparse_perm) + else + update_mat_values!(petsclib, pcache.petsc_P, alg.prec_matrix) + end + end - # Copy solution back to Julia vector using withlocalarray! 
- PETSc.withlocalarray!(pcache.u; read = true, write = false) do u_arr - copyto!(cache.u, u_arr) + # Follow the LinearSolve.jl preconditioner-reuse convention: + # reinit!(cache) → recompute preconditioner (default) + # reinit!(cache; reuse_precs=true) → skip recomputation + if !cache.precsisfresh + LibPETSc.KSPSetReusePreconditioner(petsclib, pcache.ksp, LibPETSc.PetscBool(true)) + end end + cache.isfresh = false - # Get convergence info - iters = PETSc.getiterationnumber(pcache.ksp) - reason = PETSc.getconvergedreason(pcache.ksp) - resid = PETSc.getresidualnorm(pcache.ksp) + # ── Vectors ─────────────────────────────────────────────────────────────── + petsc_x, petsc_b = ensure_vecs!(pcache, petsclib, cache.u, cache.b) + write_vec!(petsc_x, cache.u, petsclib) + write_vec!(petsc_b, cache.b, petsclib) - # Determine return code - retcode = if reason > 0 - SciMLBase.ReturnCode.Success - elseif reason == 0 - SciMLBase.ReturnCode.Default + # ── Solve ───────────────────────────────────────────────────────────────── + if alg.transposed + LibPETSc.KSPSolveTranspose(petsclib, pcache.ksp, petsc_b, petsc_x) else - SciMLBase.ReturnCode.Failure + LibPETSc.KSPSolve(petsclib, pcache.ksp, petsc_b, petsc_x) end - stats = nothing - return SciMLBase.build_linear_solution(alg, cache.u, resid, cache; retcode, iters, stats) + read_vec!(cache.u, petsc_x, petsclib) + + # ── Convergence metadata ────────────────────────────────────────────────── + iters = Int(LibPETSc.KSPGetIterationNumber(petsclib, pcache.ksp)) + reason = Int(LibPETSc.KSPGetConvergedReason(petsclib, pcache.ksp)) + resid = Float64(LibPETSc.KSPGetResidualNorm(petsclib, pcache.ksp)) + # reason > 0 → converged; reason == 0 → iteration limit not yet hit; + # reason < 0 → diverged or other failure. + retcode = reason > 0 ? ReturnCode.Success : + reason == 0 ? ReturnCode.Default : ReturnCode.Failure + + return build_linear_solution(alg, cache.u, resid, cache; retcode, iters) end -end # module LinearSolvePETScExt +end diff --git a/src/extension_algs.jl b/src/extension_algs.jl index 804b3c85e..f0c9a91f7 100644 --- a/src/extension_algs.jl +++ b/src/extension_algs.jl @@ -1,60 +1,138 @@ # This file only include the algorithm struct to be exported by LinearSolve.jl. The main # functionality is implemented as package extensions - """ -`PETScAlgorithm(solver_type = :gmres; kwargs...)` + PETScAlgorithm(solver_type = :gmres; kwargs...) -[PETSc.jl](https://github.com/JuliaParallel/PETSc.jl) is a Julia wrapper for the Portable, -Extensible Toolkit for Scientific Computation (PETSc). This algorithm provides access to -PETSc's KSP (Krylov Subspace) linear solvers. +A `LinearSolve.jl` algorithm that wraps PETSc's KSP (Krylov Subspace) linear +solvers via [PETSc.jl](https://github.com/JuliaParallel/PETSc.jl). -!!! note +!!! compat + Requires `PETSc.jl` and `MPI.jl` to be loaded: + ```julia + using PETSc, MPI + MPI.Init() + ``` + +!!! warning "Serial only" + Currently supports only serial solves (`MPI.COMM_SELF`). Passing a + multi-rank communicator will raise an error. MPI-parallel support is + planned for a future release. - Using PETSc solvers requires Julia version 1.10 or higher, and that the packages - PETSc.jl and MPI.jl are installed: `using PETSc, MPI` +--- ## Positional Arguments -The single positional argument `solver_type` specifies the KSP solver type. Common choices: +- `solver_type::Symbol` — PETSc KSP solver type. + Common values: `:gmres` (default), `:cg`, `:bcgs`, `:bicg`, `:preonly`, + `:richardson`. 
- - `:gmres` (default): Generalized Minimal Residual method - - `:cg`: Conjugate Gradient (for symmetric positive definite systems) - - `:bicg`: BiConjugate Gradient - - `:bcgs`: BiConjugate Gradient Stabilized - - `:preonly`: Apply preconditioner only (no Krylov iteration) - - `:richardson`: Richardson iteration +--- ## Keyword Arguments - - `pc_type`: Preconditioner type (e.g., `:jacobi`, `:ilu`, `:lu`, `:gamg`, `:hypre`, `:none`) - - `ksp_options`: NamedTuple of additional KSP options +| Keyword | Type | Default | Description | +| :--- | :--- | :--- | :--- | +| `pc_type` | `Symbol` | `:none` | Preconditioner type: `:jacobi`, `:ilu`, `:lu`, `:gamg`, `:hypre`, … | +| `comm` | `MPI.Comm` | `nothing` | MPI communicator. `nothing` maps to `MPI.COMM_SELF` at solve time. | +| `nullspace` | `Symbol` | `:none` | Null-space strategy: `:none`, `:constant`, or `:custom`. | +| `nullspace_vecs` | `Vector` | `nothing` | Orthonormal null-space basis; required when `nullspace = :custom`. | +| `prec_matrix` | `AbstractMatrix` | `nothing` | Separate matrix used only for building the preconditioner. | +| `initial_guess_nonzero` | `Bool` | `false` | Use the current solution vector as the initial Krylov guess. | +| `transposed` | `Bool` | `false` | Solve the transposed system `Aᵀx = b`. | +| `ksp_options` | `NamedTuple` | `(;)` | Extra PETSc Options Database flags (see table below). | + +### Common `ksp_options` + +| Option | Description | +| :--- | :--- | +| `ksp_monitor = ""` | Print residual norm each iteration. | +| `ksp_view = ""` | Print solver configuration after setup. | +| `pc_factor_levels = 2` | Fill levels for ILU. | +| `log_view = ""` | PETSc performance logging summary. | + +--- + +## Memory Management + +PETSc objects live in C-side memory outside Julia's GC. Call +`cleanup_petsc_cache!` explicitly when finished with a solve to release +resources promptly: + +```julia +PETScExt = Base.get_extension(LinearSolve, :LinearSolvePETScExt) + +sol = solve(prob, PETScAlgorithm(:gmres)) +PETScExt.cleanup_petsc_cache!(sol) + +# Or via the cache directly: +cache = SciMLBase.init(prob, PETScAlgorithm(:cg)) +solve!(cache) +PETScExt.cleanup_petsc_cache!(cache) +``` + +A GC finalizer is registered as a safety net, but explicit cleanup is +strongly preferred for deterministic, timely resource release. 
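+
+---
+
+## Reusing the Cache Across Solves
+
+When the sparsity pattern of `A` is unchanged, repeated `solve!` calls on the
+same cache reuse the existing KSP object and only update the matrix values in
+place. A minimal sketch, assuming `prob` and `PETScExt` are defined as in the
+snippet above:
+
+```julia
+cache = SciMLBase.init(prob, PETScAlgorithm(:gmres; pc_type = :ilu))
+solve!(cache)                      # first solve: builds the PETSc objects
+cache.A = 2 .* prob.A              # same sparsity pattern, new values
+cache.b = rand(length(prob.b))
+solve!(cache)                      # values updated in place, KSP reused
+PETScExt.cleanup_petsc_cache!(cache)
+```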
+ +--- ## Example ```julia -using PETSc, MPI, SparseArrays -A = sprand(100, 100, 0.1) + 10I -b = rand(100) -prob = LinearProblem(A, b) -alg = PETScAlgorithm(:gmres; pc_type = :ilu) -sol = solve(prob, alg) +using LinearSolve, PETSc, MPI, SparseArrays, LinearAlgebra + +MPI.Init() +PETScExt = Base.get_extension(LinearSolve, :LinearSolvePETScExt) + +n = 100 +A = sprand(n, n, 0.1); A = A + A' + 20I +b = rand(n) + +sol = solve( + LinearProblem(A, b), + PETScAlgorithm(:gmres; pc_type = :ilu, ksp_options = (ksp_monitor = "",)) +) +println("Residual: ", norm(A * sol.u - b) / norm(b)) +PETScExt.cleanup_petsc_cache!(sol) ``` """ struct PETScAlgorithm <: SciMLLinearSolveAlgorithm solver_type::Symbol pc_type::Symbol + comm::Any # MPI.Comm, stored as Any to avoid an MPI.jl dependency in LinearSolve + nullspace::Symbol # :none | :constant | :custom + nullspace_vecs::Any # nothing | Vector of AbstractVectors + prec_matrix::Any # nothing | AbstractMatrix + initial_guess_nonzero::Bool + transposed::Bool ksp_options::NamedTuple + function PETScAlgorithm( solver_type::Symbol = :gmres; pc_type::Symbol = :none, - ksp_options::NamedTuple = NamedTuple() + comm = nothing, + nullspace::Symbol = :none, + nullspace_vecs = nothing, + prec_matrix = nothing, + initial_guess_nonzero::Bool = false, + transposed::Bool = false, + ksp_options::NamedTuple = NamedTuple(), + ) + Base.get_extension(@__MODULE__, :LinearSolvePETScExt) === nothing && error( + "PETScAlgorithm requires PETSc and MPI to be loaded: `using PETSc, MPI`" + ) + nullspace ∈ (:none, :constant, :custom) || error( + "nullspace must be :none, :constant, or :custom (got :$nullspace)" + ) + nullspace == :custom && nullspace_vecs === nothing && error( + "nullspace = :custom requires nullspace_vecs to be provided" + ) + return new( + solver_type, pc_type, comm, + nullspace, nullspace_vecs, + prec_matrix, + initial_guess_nonzero, transposed, + ksp_options, ) - ext = Base.get_extension(@__MODULE__, :LinearSolvePETScExt) - if ext === nothing - error("PETScAlgorithm requires that PETSc and MPI are loaded, i.e. `using PETSc, MPI`") - else - return new(solver_type, pc_type, ksp_options) - end end end @@ -143,9 +221,9 @@ end """ `CUDAOffload32MixedLUFactorization()` -A mixed precision GPU-accelerated LU factorization that converts matrices to Float32 +A mixed precision GPU-accelerated LU factorization that converts matrices to Float32 before offloading to CUDA GPU for factorization, then converts back for the solve. -This can provide speedups when the reduced precision is acceptable and memory +This can provide speedups when the reduced precision is acceptable and memory bandwidth is a bottleneck. ## Performance Notes @@ -262,10 +340,10 @@ end """ RFLUFactorization{P, T}(; pivot = Val(true), thread = Val(true)) -A fast pure Julia LU-factorization implementation using RecursiveFactorization.jl. -This is by far the fastest LU-factorization implementation, usually outperforming -OpenBLAS and MKL for smaller matrices (<500x500), but currently optimized only for -Base `Array` with `Float32` or `Float64`. Additional optimization for complex matrices +A fast pure Julia LU-factorization implementation using RecursiveFactorization.jl. +This is by far the fastest LU-factorization implementation, usually outperforming +OpenBLAS and MKL for smaller matrices (<500x500), but currently optimized only for +Base `Array` with `Float32` or `Float64`. Additional optimization for complex matrices is in the works. ## Type Parameters @@ -273,9 +351,9 @@ is in the works. 
- `T`: Threading strategy as `Val{Bool}`. `Val{true}` enables multi-threading for performance. ## Constructor Arguments -- `pivot = Val(true)`: Enable partial pivoting. Set to `Val{false}` to disable for speed +- `pivot = Val(true)`: Enable partial pivoting. Set to `Val{false}` to disable for speed at the cost of numerical stability. -- `thread = Val(true)`: Enable multi-threading. Set to `Val{false}` for single-threaded +- `thread = Val(true)`: Enable multi-threading. Set to `Val{false}` for single-threaded execution. - `throwerror = true`: Whether to throw an error if RecursiveFactorization.jl is not loaded. @@ -294,7 +372,7 @@ using RecursiveFactorization # Fast, stable (with pivoting) alg1 = RFLUFactorization() # Fastest (no pivoting), less stable -alg2 = RFLUFactorization(pivot=Val(false)) +alg2 = RFLUFactorization(pivot=Val(false)) ``` """ struct RFLUFactorization{P, T} <: AbstractDenseFactorization @@ -316,7 +394,7 @@ end A fast pure Julia LU-factorization implementation using RecursiveFactorization.jl. This method utilizes a butterfly -factorization approach rather than pivoting. +factorization approach rather than pivoting. """ struct ButterflyFactorization{T} <: AbstractDenseFactorization thread::Val{T} @@ -404,7 +482,7 @@ Using this solver requires that FastLapackInterface.jl is loaded: `using FastLap ```julia using FastLapackInterface # QR with column pivoting -alg1 = FastQRFactorization() +alg1 = FastQRFactorization() # QR without pivoting for speed alg2 = FastQRFactorization(pivot=nothing) # Custom block size @@ -800,8 +878,8 @@ function GinkgoJL_GMRES end """ MetalLUFactorization() -A wrapper over Apple's Metal GPU library for LU factorization. Direct calls to Metal -in a way that pre-allocates workspace to avoid allocations and automatically offloads +A wrapper over Apple's Metal GPU library for LU factorization. Direct calls to Metal +in a way that pre-allocates workspace to avoid allocations and automatically offloads to the GPU. This solver is optimized for Metal-capable Apple Silicon Macs. ## Requirements @@ -883,8 +961,8 @@ end """ BLISLUFactorization() -An LU factorization implementation using the BLIS (BLAS-like Library Instantiation Software) -framework. BLIS provides high-performance dense linear algebra kernels optimized for various +An LU factorization implementation using the BLIS (BLAS-like Library Instantiation Software) +framework. BLIS provides high-performance dense linear algebra kernels optimized for various CPU architectures. ## Requirements @@ -917,7 +995,7 @@ end `CUSOLVERRFFactorization(; symbolic = :RF, reuse_symbolic = true)` A GPU-accelerated sparse LU factorization using NVIDIA's cusolverRF library. -This solver is specifically designed for sparse matrices on CUDA GPUs and +This solver is specifically designed for sparse matrices on CUDA GPUs and provides high-performance factorization and solve capabilities. ## Keyword Arguments @@ -925,11 +1003,11 @@ provides high-performance factorization and solve capabilities. - `symbolic`: The symbolic factorization method to use. Options are: - `:RF` (default): Use cusolverRF's built-in symbolic analysis - `:KLU`: Use KLU for symbolic analysis - - `reuse_symbolic`: Whether to reuse the symbolic factorization when the + - `reuse_symbolic`: Whether to reuse the symbolic factorization when the sparsity pattern doesn't change (default: `true`) !!! 
note - This solver requires CUSOLVERRF.jl to be loaded and only supports + This solver requires CUSOLVERRF.jl to be loaded and only supports `Float64` element types with `Int32` indices. """ struct CUSOLVERRFFactorization <: AbstractSparseFactorization @@ -1021,7 +1099,7 @@ struct OpenBLAS32MixedLUFactorization <: AbstractDenseFactorization end """ RF32MixedLUFactorization{P, T}(; pivot = Val(true), thread = Val(true)) -A mixed precision LU factorization using RecursiveFactorization.jl that performs +A mixed precision LU factorization using RecursiveFactorization.jl that performs factorization in Float32 precision while maintaining Float64 interface. This combines the speed benefits of RecursiveFactorization.jl with reduced precision computation for additional performance gains. @@ -1031,9 +1109,9 @@ for additional performance gains. - `T`: Threading strategy as `Val{Bool}`. `Val{true}` enables multi-threading for performance. ## Constructor Arguments -- `pivot = Val(true)`: Enable partial pivoting. Set to `Val{false}` to disable for speed +- `pivot = Val(true)`: Enable partial pivoting. Set to `Val{false}` to disable for speed at the cost of numerical stability. -- `thread = Val(true)`: Enable multi-threading. Set to `Val{false}` for single-threaded +- `thread = Val(true)`: Enable multi-threading. Set to `Val{false}` for single-threaded execution. ## Performance Notes diff --git a/test/petsctests.jl b/test/petsctests.jl index 3dcd00e38..c1e82f2f5 100644 --- a/test/petsctests.jl +++ b/test/petsctests.jl @@ -3,88 +3,502 @@ using LinearSolve using MPI using PETSc using SparseArrays +using Random using Test MPI.Init() -# Get a PETSc library and initialize -petsclib = first(PETSc.petsclibs) -PETSc.initialized(petsclib) || PETSc.initialize(petsclib) +const PETScExt = Base.get_extension(LinearSolve, :LinearSolvePETScExt) +const rank = MPI.Comm_rank(MPI.COMM_WORLD) +const nprocs = MPI.Comm_size(MPI.COMM_WORLD) -@testset "PETScAlgorithm Basic Tests" begin - n = 100 +# ══════════════════════════════════════════════════════════════════════════════ +# SERIAL TESTS (comm = MPI.COMM_SELF, the default) +# ══════════════════════════════════════════════════════════════════════════════ - # Create a sparse positive definite matrix +@testset "Serial: Basic Solvers" begin + n = 100 A = sprand(n, n, 0.05) + 10I - A = A'A # Make symmetric positive definite + A = A'A b = rand(n) - prob = LinearProblem(A, b) - @testset "GMRES solver" begin - alg = PETScAlgorithm(:gmres) - sol = solve(prob, alg; abstol = 1.0e-10, reltol = 1.0e-10) + @testset "GMRES" begin + sol = solve(prob, PETScAlgorithm(:gmres); abstol = 1.0e-10, reltol = 1.0e-10) @test norm(A * sol.u - b) / norm(b) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(sol) end - @testset "CG solver" begin - alg = PETScAlgorithm(:cg) - sol = solve(prob, alg; abstol = 1.0e-10, reltol = 1.0e-10) + @testset "CG" begin + sol = solve(prob, PETScAlgorithm(:cg); abstol = 1.0e-10, reltol = 1.0e-10) @test norm(A * sol.u - b) / norm(b) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(sol) end - @testset "BiCGSTAB solver" begin - alg = PETScAlgorithm(:bcgs) - sol = solve(prob, alg; abstol = 1.0e-10, reltol = 1.0e-10) + @testset "BiCGSTAB" begin + sol = solve(prob, PETScAlgorithm(:bcgs); abstol = 1.0e-10, reltol = 1.0e-10) @test norm(A * sol.u - b) / norm(b) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(sol) end - @testset "GMRES with Jacobi preconditioner" begin - alg = PETScAlgorithm(:gmres; pc_type = :jacobi) - sol = solve(prob, alg; abstol = 1.0e-10, reltol = 1.0e-10) + @testset "GMRES + 
Jacobi" begin + sol = solve( + prob, PETScAlgorithm(:gmres; pc_type = :jacobi); + abstol = 1.0e-10, reltol = 1.0e-10 + ) @test norm(A * sol.u - b) / norm(b) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(sol) end - @testset "GMRES with ILU preconditioner" begin - alg = PETScAlgorithm(:gmres; pc_type = :ilu) - sol = solve(prob, alg; abstol = 1.0e-10, reltol = 1.0e-10) + @testset "GMRES + ILU" begin + sol = solve( + prob, PETScAlgorithm(:gmres; pc_type = :ilu); + abstol = 1.0e-10, reltol = 1.0e-10 + ) @test norm(A * sol.u - b) / norm(b) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(sol) end end -@testset "PETScAlgorithm Dense Matrix" begin +@testset "Serial: Dense Matrix" begin n = 50 - A = rand(n, n) + 10I - A = A'A + A = rand(n, n) + 10I; A = A'A b = rand(n) - - prob = LinearProblem(A, b) - alg = PETScAlgorithm(:gmres) - sol = solve(prob, alg; abstol = 1.0e-10, reltol = 1.0e-10) + sol = solve(LinearProblem(A, b), PETScAlgorithm(:gmres); abstol = 1.0e-10) @test norm(A * sol.u - b) / norm(b) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(sol) end -@testset "PETScAlgorithm Cache Interface" begin +@testset "Serial: Cache Interface" begin n = 50 - A = sprand(n, n, 0.1) + 10I - A = A'A + A = sprand(n, n, 0.1) + 10I; A = A'A b1 = rand(n) - - prob = LinearProblem(A, b1) - alg = PETScAlgorithm(:gmres) - - # Initialize cache - cache = SciMLBase.init(prob, alg; abstol = 1.0e-10, reltol = 1.0e-10) - - # First solve + cache = SciMLBase.init( + LinearProblem(A, b1), PETScAlgorithm(:gmres); + abstol = 1.0e-10, reltol = 1.0e-10 + ) sol1 = solve!(cache) @test norm(A * sol1.u - b1) / norm(b1) < 1.0e-6 - # Update b and solve again b2 = rand(n) cache.b = b2 sol2 = solve!(cache) @test norm(A * sol2.u - b2) / norm(b2) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(cache) +end + +@testset "Serial: Nullspace Constant" begin + n = 100 + D = sparse(1:n, 1:n, 2.0, n, n) + D -= sparse(1:(n - 1), 2:n, 1.0, n, n) + D -= sparse(2:n, 1:(n - 1), 1.0, n, n) + D[1, 1] = 1.0; D[end, end] = 1.0 + b = rand(n); b .-= sum(b) / n + + alg = PETScAlgorithm(:cg; pc_type = :jacobi, nullspace = :constant) + sol = solve(LinearProblem(D, b), alg; abstol = 1.0e-10) + @test sol.retcode == SciMLBase.ReturnCode.Success + @test norm(D * sol.u - b) / norm(b) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(sol) +end + +@testset "Serial: Nullspace Custom" begin + n = 50 + A = spdiagm(0 => 2 * ones(n), 1 => -ones(n - 1), -1 => -ones(n - 1)) + b = rand(n); b .-= sum(b) / n + + alg = PETScAlgorithm(:gmres; pc_type = :ilu, nullspace = :custom, nullspace_vecs = [ones(n)]) + sol = solve(LinearProblem(A, b), alg; abstol = 1.0e-10) + @test sol.retcode == SciMLBase.ReturnCode.Success + @test norm(A * sol.u - b) / norm(b) < 1.0e-6 +end + +@testset "Serial: Transposed Solve" begin + n = 50 + A = sprand(n, n, 0.2) + 5I; b = rand(n) + sol = solve(LinearProblem(A, b), PETScAlgorithm(:gmres; transposed = true)) + @test norm(A' * sol.u - b) / norm(b) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(sol) +end + +@testset "Serial: Warm Start" begin + n = 200 + A = sprand(n, n, 0.02) + 10I; A = A'A; b = rand(n) + prob = LinearProblem(A, b) + + sol_cold = solve( + prob, PETScAlgorithm(:cg; initial_guess_nonzero = false); + reltol = 1.0e-12 + ) + iters_cold = sol_cold.iters + PETScExt.cleanup_petsc_cache!(sol_cold) + + cache_warm = SciMLBase.init( + prob, PETScAlgorithm(:cg; initial_guess_nonzero = true); reltol = 1.0e-12 + ) + solve!(cache_warm) + cache_warm.b = b + rand(n) * 0.01 + sol2 = solve!(cache_warm) + @test sol2.iters < iters_cold + PETScExt.cleanup_petsc_cache!(cache_warm) +end + +@testset 
"Serial: Scalar Types" begin + n = 50 + + @testset "Float64 sparse" begin + A = sprand(Float64, n, n, 0.1) + 10I; A = A'A; b = rand(Float64, n) + sol = solve(LinearProblem(A, b), PETScAlgorithm(:cg); abstol = 1.0e-10) + @test eltype(sol.u) == Float64 + @test norm(A * sol.u - b) / norm(b) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(sol) + end + + @testset "Float32 sparse" begin + A32 = sparse(Float32.(Matrix(sprand(Float64, n, n, 0.1) + 10I))) + A32 = A32'A32; b32 = rand(Float32, n) + sol32 = solve(LinearProblem(A32, b32), PETScAlgorithm(:cg); abstol = 1.0f-5) + @test eltype(sol32.u) == Float32 + @test norm(A32 * sol32.u - b32) / norm(b32) < 1.0f-3 + PETScExt.cleanup_petsc_cache!(sol32) + end + + @testset "ComplexF64 sparse" begin + B = sprand(ComplexF64, n, n, 0.1) + Ac = B' * B + 10I + bc = rand(ComplexF64, n) + sol = solve(LinearProblem(Ac, bc), PETScAlgorithm(:gmres); abstol = 1.0e-10) + @test eltype(sol.u) == ComplexF64 + @test norm(Ac * sol.u - bc) / norm(bc) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(sol) + end + + @testset "ComplexF32 sparse" begin + B32 = sparse(ComplexF32.(Matrix(sprand(ComplexF64, n, n, 0.1)))) + Ac32 = B32' * B32 + 10I + bc32 = rand(ComplexF32, n) + sol32 = solve(LinearProblem(Ac32, bc32), PETScAlgorithm(:gmres); abstol = 1.0f-5) + @test eltype(sol32.u) == ComplexF32 + @test norm(Ac32 * sol32.u - bc32) / norm(bc32) < 1.0f-3 + PETScExt.cleanup_petsc_cache!(sol32) + end + + @testset "Unsupported type errors" begin + A_big = BigFloat.(Matrix(sprand(n, n, 0.1) + 10I)) + b_big = BigFloat.(rand(n)) + @test_throws Exception solve( + LinearProblem(sparse(A_big), b_big), PETScAlgorithm(:gmres) + ) + end +end + +@testset "PETSc Options Database" begin + n = 100 + A = sprand(n, n, 0.1); A = A + A' + 20I + b = rand(n) + prob = LinearProblem(A, b) + + @testset "High-precision tolerance" begin + alg = PETScAlgorithm(:gmres; ksp_options = (ksp_rtol = 1.0e-14, ksp_atol = 1.0e-14)) + sol = solve(prob, alg) + @test norm(A * sol.u - b) / norm(b) < 1.0e-13 + PETScExt.cleanup_petsc_cache!(sol) + end + + @testset "Monitoring flags (no crash)" begin + alg = PETScAlgorithm(:cg; ksp_options = (ksp_monitor = "", ksp_view = "")) + sol = solve(prob, alg) + @test sol.retcode == SciMLBase.ReturnCode.Success + PETScExt.cleanup_petsc_cache!(sol) + end + + @testset "KSP type override via options" begin + alg = PETScAlgorithm(:gmres; ksp_options = (ksp_type = "cg",)) + sol = solve(prob, alg) + @test sol.retcode == SciMLBase.ReturnCode.Success + PETScExt.cleanup_petsc_cache!(sol) + end +end + +@testset "Serial: Cleanup" begin + n = 50 + A = sprand(n, n, 0.1) + 10I; A = A'A; b = rand(n) + prob = LinearProblem(A, b) + alg = PETScAlgorithm(:gmres; pc_type = :jacobi) + + @testset "via solution" begin + sol = solve(prob, alg) + pcache = sol.cache.cacheval + @test pcache.ksp !== nothing + PETScExt.cleanup_petsc_cache!(sol) + @test pcache.ksp === nothing + end + + @testset "via cache" begin + cache = SciMLBase.init(prob, alg); solve!(cache) + PETScExt.cleanup_petsc_cache!(cache) + @test cache.cacheval.ksp === nothing + end + + @testset "idempotent" begin + sol = solve(prob, alg) + PETScExt.cleanup_petsc_cache!(sol) + PETScExt.cleanup_petsc_cache!(sol) + @test sol.cache.cacheval.ksp === nothing + end + + @testset "empty cache (before solve)" begin + ce = SciMLBase.init(prob, alg) + PETScExt.cleanup_petsc_cache!(ce) + @test ce.cacheval.ksp === nothing + end end -PETSc.finalize(petsclib) +# ══════════════════════════════════════════════════════════════════════════════ +# MATRIX REBUILD LOGIC TESTS +# Exercises 
the three cases in solve!: +# Case 1 — first solve (no KSP yet) +# Case 2 — structure changed → full KSP rebuild +# Case 3 — same structure, values only → in-place update, KSP reused +# ══════════════════════════════════════════════════════════════════════════════ +@testset "Serial: Matrix Rebuild Logic" begin + n = 50 + Random.seed!(42) + + # Helper: SPD sparse matrix with a fixed sparsity pattern, variable values. + base = sprand(n, n, 0.1); base = base + base' + 10I + pattern_I, pattern_J, _ = findnz(base) + + function make_spd_with_pattern(scale) + vals = abs.(randn(length(pattern_I))) .* scale + A = sparse(pattern_I, pattern_J, vals, n, n) + A = A + A' + d = vec(sum(abs.(A), dims = 2)) .+ scale + for i in 1:n + A[i, i] = d[i] + end + return A + end + + @testset "Case 1: first solve builds KSP" begin + A = make_spd_with_pattern(10.0) + b = rand(n) + cache = SciMLBase.init( + LinearProblem(A, b), PETScAlgorithm(:cg; pc_type = :jacobi); abstol = 1.0e-10 + ) + @test cache.cacheval.ksp === nothing + sol = solve!(cache) + @test cache.cacheval.ksp !== nothing + @test norm(A * sol.u - b) / norm(b) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(cache) + end + + @testset "Case 3: same sparsity — KSP reused, values updated" begin + A1 = make_spd_with_pattern(10.0) + b1 = rand(n) + cache = SciMLBase.init( + LinearProblem(A1, b1), PETScAlgorithm(:cg; pc_type = :jacobi); abstol = 1.0e-10 + ) + solve!(cache) + ksp_before = cache.cacheval.ksp + + A2 = make_spd_with_pattern(20.0) + b2 = rand(n) + SciMLBase.reinit!(cache; A = A2, b = b2) + sol2 = solve!(cache) + + @test cache.cacheval.ksp === ksp_before + @test norm(A2 * sol2.u - b2) / norm(b2) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(cache) + end + + @testset "Case 2: sparsity changed — KSP rebuilt" begin + A1 = make_spd_with_pattern(10.0) + b1 = rand(n) + cache = SciMLBase.init( + LinearProblem(A1, b1), PETScAlgorithm(:cg; pc_type = :jacobi); abstol = 1.0e-10 + ) + solve!(cache) + ksp_before = cache.cacheval.ksp + + A3 = sprand(n, n, 0.2) + 15I; A3 = A3 + A3' + b3 = rand(n) + SciMLBase.reinit!(cache; A = A3, b = b3) + sol3 = solve!(cache) + + @test cache.cacheval.ksp !== ksp_before + @test norm(A3 * sol3.u - b3) / norm(b3) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(cache) + end + + @testset "Case 3: correctness over multiple value-only updates" begin + A = make_spd_with_pattern(10.0) + b = rand(n) + cache = SciMLBase.init( + LinearProblem(A, b), PETScAlgorithm(:cg; pc_type = :jacobi); abstol = 1.0e-10 + ) + solve!(cache) + for _ in 1:5 + A_new = make_spd_with_pattern(10.0 + rand()) + b_new = rand(n) + SciMLBase.reinit!(cache; A = A_new, b = b_new) + sol = solve!(cache) + @test norm(A_new * sol.u - b_new) / norm(b_new) < 1.0e-6 + end + PETScExt.cleanup_petsc_cache!(cache) + end + + @testset "Case 3: same-size dense — KSP reused" begin + A1 = Matrix(make_spd_with_pattern(10.0)) + b1 = rand(n) + cache = SciMLBase.init( + LinearProblem(A1, b1), PETScAlgorithm(:gmres); abstol = 1.0e-10 + ) + sol1 = solve!(cache) + @test norm(A1 * sol1.u - b1) / norm(b1) < 1.0e-6 + + ksp_before = cache.cacheval.ksp + A2 = Matrix(make_spd_with_pattern(20.0)) + b2 = rand(n) + SciMLBase.reinit!(cache; A = A2, b = b2) + sol2 = solve!(cache) + + @test cache.cacheval.ksp === ksp_before + @test norm(A2 * sol2.u - b2) / norm(b2) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(cache) + end + + @testset "Case 2: size change dense — KSP rebuilt" begin + n2 = n + 5 + A1 = Matrix(make_spd_with_pattern(10.0)) + b1 = rand(n) + cache = SciMLBase.init( + LinearProblem(A1, b1), PETScAlgorithm(:gmres); 
abstol = 1.0e-10 + ) + sol1 = solve!(cache) + @test norm(A1 * sol1.u - b1) / norm(b1) < 1.0e-6 + + ksp_before = cache.cacheval.ksp + A2 = rand(n2, n2); A2 = A2'A2 + n2 * I + b2 = rand(n2) + SciMLBase.reinit!(cache; A = A2, b = b2, u = zeros(n2)) + sol2 = solve!(cache) + + @test cache.cacheval.ksp !== ksp_before + @test norm(A2 * sol2.u - b2) / norm(b2) < 1.0e-6 + PETScExt.cleanup_petsc_cache!(cache) + end +end + +# ══════════════════════════════════════════════════════════════════════════════ +# NO-COPY / MEMORY SAFETY TESTS +# Verify that Julia backing arrays are never unexpectedly mutated by PETSc. +# ══════════════════════════════════════════════════════════════════════════════ + +@testset "No-copy verification" begin + n = 50 + function make_spd_sparse(n) + A = sprand(n, n, 0.2) + 10I; return A + A' + end + function make_spd_dense(n) + B = randn(n, n); return B'B + 10I + end + + @testset "Sparse operator: backing arrays not copied" begin + A = make_spd_sparse(n) + b = rand(n) + ptr_nzval = pointer(A.nzval) + ptr_colptr = pointer(A.colptr) + sol = solve( + LinearProblem(A, b), PETScAlgorithm(:gmres; pc_type = :ilu); + abstol = 1.0e-10 + ) + @test norm(A * sol.u - b) / norm(b) < 1.0e-6 + @test pointer(A.nzval) === ptr_nzval + @test pointer(A.colptr) === ptr_colptr + PETScExt.cleanup_petsc_cache!(sol) + end + + @testset "Dense operator: Julia array not copied" begin + A = make_spd_dense(n) + b = rand(n) + ptr_before = pointer(A) + sol = solve( + LinearProblem(A, b), PETScAlgorithm(:gmres; pc_type = :jacobi); + abstol = 1.0e-10 + ) + @test norm(A * sol.u - b) / norm(b) < 1.0e-6 + @test pointer(A) === ptr_before + PETScExt.cleanup_petsc_cache!(sol) + end + + @testset "Sparse Case 3: nzval pointer stable across value-only updates" begin + base = make_spd_sparse(n) + I_idx, J_idx, _ = findnz(base) + function make_with_pattern(scale) + vals = abs.(randn(length(I_idx))) .* scale + A = sparse(I_idx, J_idx, vals, n, n); A = A + A' + d = vec(sum(abs.(A), dims = 2)) .+ scale + for i in 1:n + A[i, i] = d[i] + end + return A + end + + A1 = make_with_pattern(10.0) + cache = SciMLBase.init( + LinearProblem(A1, rand(n)), PETScAlgorithm(:cg; pc_type = :jacobi); abstol = 1.0e-10 + ) + solve!(cache) + + A2 = make_with_pattern(20.0) + ptr_nzval = pointer(A2.nzval) + ptr_colptr = pointer(A2.colptr) + b2 = rand(n) + SciMLBase.reinit!(cache; A = A2, b = b2) + sol = solve!(cache) + + @test norm(A2 * sol.u - b2) / norm(b2) < 1.0e-6 + @test pointer(A2.nzval) === ptr_nzval + @test pointer(A2.colptr) === ptr_colptr + PETScExt.cleanup_petsc_cache!(cache) + end + + @testset "Dense explicit prec_matrix: not mutated by LU/Cholesky" begin + # With a separate prec_matrix, PETSc factors its own internal copy. + # The user's P array must be unchanged after one or more solves. + A = make_spd_dense(n) + P = copy(A) + P_snapshot = copy(P) + alg = PETScAlgorithm(:preonly; pc_type = :cholesky, prec_matrix = P) + + b1 = rand(n) + cache = SciMLBase.init(LinearProblem(A, b1), alg; abstol = 1.0e-10) + sol1 = solve!(cache) + @test norm(A * sol1.u - b1) / norm(b1) < 1.0e-6 + @test P == P_snapshot # first solve must not mutate P + + b2 = rand(n) + SciMLBase.reinit!(cache; b = b2) + sol2 = solve!(cache) + @test norm(A * sol2.u - b2) / norm(b2) < 1.0e-6 + @test P == P_snapshot # second solve must not mutate P either + PETScExt.cleanup_petsc_cache!(cache) + end + + @testset "Dense operator with factorising PC (no prec_matrix): documented mutation" begin + # Without a separate prec_matrix, petsc_P aliases petsc_A. 
+ # An in-place factorisation (LU/Cholesky) will overwrite the Julia array. + # This is documented behaviour — the solve is still correct. + A = make_spd_dense(n) + A_snapshot = copy(A) + b = rand(n) + sol = solve( + LinearProblem(A, b), PETScAlgorithm(:preonly; pc_type = :lu); + abstol = 1.0e-10 + ) + @test norm(A_snapshot * sol.u - b) / norm(b) < 1.0e-6 + # A may have been overwritten by LU — that's expected, no assertion on A's values + PETScExt.cleanup_petsc_cache!(sol) + end +end diff --git a/test/petsctests_mpi.jl b/test/petsctests_mpi.jl new file mode 100644 index 000000000..ac1b52cb6 --- /dev/null +++ b/test/petsctests_mpi.jl @@ -0,0 +1,114 @@ +using LinearAlgebra +using LinearSolve +using MPI +using PETSc +using SparseArrays +using Random +using Test + +Random.seed!(1234) + +MPI.Init() + +const PETScExt = Base.get_extension(LinearSolve, :LinearSolvePETScExt) +const rank = MPI.Comm_rank(MPI.COMM_WORLD) +const comm_size = MPI.Comm_size(MPI.COMM_WORLD) + +# ══════════════════════════════════════════════════════════════════════════════ +# MPI-PARALLEL TESTS (comm = MPI.COMM_WORLD) +# +# Run with: mpiexecjl -n 2 julia test/petsctests_mpi.jl +# (or -n 4, etc.) +# +# Phase 1 strategy: every rank holds the full Julia matrix; PETSc distributes +# the rows across ranks for the solve. +# ══════════════════════════════════════════════════════════════════════════════ + + +@testset "MPI (comm_size=$comm_size): Ownership Distribution" begin + n = 100 + A = sprand(n, n, 0.1) + 10I + alg = PETScAlgorithm(:gmres; comm = MPI.COMM_WORLD) + cache = SciMLBase.init(LinearProblem(A, rand(n)), alg) + solve!(cache) + + pcache = cache.cacheval + @test pcache.rend - pcache.rstart <= (n ÷ comm_size) + 1 + + total_rows = MPI.Allreduce(pcache.rend - pcache.rstart, MPI.SUM, MPI.COMM_WORLD) + @test total_rows == n + + PETScExt.cleanup_petsc_cache!(cache) +end + + +@testset "MPI (comm_size=$comm_size): GMRES + Jacobi" begin + n = 200 + A = sprand(n, n, 0.05) + 10I; A = A'A + b = rand(n) + prob = LinearProblem(A, b) + alg = PETScAlgorithm(:gmres; pc_type = :jacobi, comm = MPI.COMM_WORLD) + + sol = solve(prob, alg; abstol = 1.0e-10, reltol = 1.0e-10) + + # solve! now auto-gathers — sol.u is the full solution on all ranks + if rank == 0 + @test norm(A * sol.u - b) / norm(b) < 1.0e-6 + end + + PETScExt.cleanup_petsc_cache!(sol) +end + +@testset "MPI (comm_size=$comm_size): CG" begin + n = 200 + A = sprand(n, n, 0.05) + 10I; A = A'A; b = rand(n) + prob = LinearProblem(A, b) + alg = PETScAlgorithm(:cg; pc_type = :jacobi, comm = MPI.COMM_WORLD) + sol = solve(prob, alg; abstol = 1.0e-10, reltol = 1.0e-10) + + if rank == 0 + @test norm(A * sol.u - b) / norm(b) < 1.0e-6 + end + + PETScExt.cleanup_petsc_cache!(sol) +end + +@testset "MPI (comm_size=$comm_size): Dense Matrix" begin + n = 80 + A = rand(n, n) + 10I; A = A'A; b = rand(n) + prob = LinearProblem(A, b) + alg = PETScAlgorithm(:gmres; pc_type = :jacobi, comm = MPI.COMM_WORLD) + sol = solve(prob, alg; abstol = 1.0e-10, reltol = 1.0e-10) + + if rank == 0 + @test norm(A * sol.u - b) / norm(b) < 1.0e-6 + end + PETScExt.cleanup_petsc_cache!(sol) +end + +@testset "MPI (comm_size=$comm_size): Cache Interface" begin + n = 100 + A = sprand(n, n, 0.1) + 10I; A = A'A + b1 = rand(n); b2 = rand(n) + alg = PETScAlgorithm(:gmres; pc_type = :jacobi, comm = MPI.COMM_WORLD) + + cache = SciMLBase.init(LinearProblem(A, b1), alg; abstol = 1.0e-10) + sol1 = solve!(cache) + + # No manual gather needed — solve! 
auto-gathers the full solution + if rank == 0 + @test norm(A * sol1.u - b1) / norm(b1) < 1.0e-6 + end + + # Update b and re-solve + cache.b .= b2 + sol2 = solve!(cache) + + if rank == 0 + @test norm(A * sol2.u - b2) / norm(b2) < 1.0e-6 + end + + PETScExt.cleanup_petsc_cache!(cache) + PETScExt.cleanup_petsc_cache!(sol1) + PETScExt.cleanup_petsc_cache!(sol2) +end