jbphyswx
diff --git a/‎ext/StructureFunctionsDistributedExt.jl‎
Lines changed: 68 additions & 0 deletions b/‎ext/StructureFunctionsDistributedExt.jl‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎ext/StructureFunctionsGPUExt.jl‎
Lines changed: 105 additions & 0 deletions b/‎ext/StructureFunctionsGPUExt.jl‎
Lines changed: 105 additions & 0 deletions
diff --git a/‎ext/StructureFunctionsOhMyThreadsExt.jl‎
Lines changed: 90 additions & 1 deletion b/‎ext/StructureFunctionsOhMyThreadsExt.jl‎
Lines changed: 90 additions & 1 deletion
@@ -163,4 +163,72 @@ function SFC.parallel_calculate_structure_function(
     )
 end
 
+"""
+    SFC._dispatch_single_pass(::DistributedBackend, x, u, distance_bins; kwargs...)
+
+Calculate single-pass structure functions with multi-process `Distributed` execution.
+
+The outer loop over points is dealt round-robin into one strided chunk per worker
+(`c, c + n_chunks, c + 2n_chunks, ...`). Each chunk accumulates into a single
+`16 × n_bins` `Float64` matrix (rows 1:8 hold the sums, rows 9:16 the pair counts) and the
+per-chunk matrices are combined element-wise by `@distributed (+)`.
+
+# Arguments
+- `x`: matrix whose rows 1 and 2 of column `i` are read as the coordinates of point `i`.
+- `u`: matrix whose rows 1 and 2 of column `i` are read as the vector value at point `i`.
+- `distance_bins`: bin edges; `length(distance_bins) - 1` bins are accumulated.
+- `kwargs...`: accepted and ignored.
+
+# Returns
+The result of `SFC.postprocess_single_pass_results(sums, counts, distance_bins)`, where
+`sums` is `8 × n_bins` with element type `promote_type(float(FT1), float(FT2))` and
+`counts` is an `8 × n_bins` `Int64` matrix.
+"""
+function SFC._dispatch_single_pass(
+    ::SF.DistributedBackend,
+    x::AbstractMatrix{FT1},
+    u::AbstractMatrix{FT2},
+    distance_bins::AbstractVector{FT3};
+    kwargs...
+) where {FT1 <: Number, FT2 <: Number, FT3 <: Number}
+    OT = promote_type(float(FT1), float(FT2))
+    n_bins = length(distance_bins) - 1
+    n_points = size(x, 2)
+
+    # Always keep at least one chunk: `@distributed` with a reducer throws on an empty
+    # range, so iterating `1:n_points` directly fails when there are no points.
+    n_chunks = max(1, min(Distributed.nworkers(), n_points))
+
+    # Point `i` is paired with every `j > i`, so the work per `i` shrinks as `i` grows.
+    # Strided chunks give every worker a mix of cheap and expensive points, and each
+    # chunk allocates exactly one accumulator instead of one matrix per point.
+    combined_reduced = Distributed.@distributed (+) for c in 1:n_chunks
+        acc = zeros(Float64, 16, n_bins)
+
+        for i in c:n_chunks:n_points
+            x_i = SA.SVector{2, FT1}(x[1, i], x[2, i])
+            u_i = SA.SVector{2, FT2}(u[1, i], u[2, i])
+
+            for j in (i+1):n_points
+                x_j = SA.SVector{2, FT1}(x[1, j], x[2, j])
+
+                dx = SFH.δr(x_i, x_j)
+                r = LA.norm(dx)
+
+                bin_idx = SFH.digitize(r, distance_bins)
+
+                # Pairs whose separation falls outside the bin edges are dropped.
+                if 1 <= bin_idx <= n_bins
+                    u_j = SA.SVector{2, FT2}(u[1, j], u[2, j])
+                    du = u_j - u_i
+
+                    rh = SFH.r̂(x_i, x_j)
+                    nh = SFH.n̂(rh)
+
+                    # Components of the increment along `rh` and along `nh`.
+                    du_L = LA.dot(du, rh)
+                    du_T = LA.dot(du, nh)
+
+                    du_L2 = du_L * du_L
+                    du_T2 = du_T * du_T
+
+                    acc[1, bin_idx] += du_L2 + du_T2
+                    acc[2, bin_idx] += du_L2
+                    acc[3, bin_idx] += du_T2
+                    acc[4, bin_idx] += du_L * (du_L2 + du_T2)
+                    acc[5, bin_idx] += du_L * du_L2
+                    acc[6, bin_idx] += du_L2 * du_T
+                    acc[7, bin_idx] += du_L * du_T2
+                    acc[8, bin_idx] += du_T * du_T2
+
+                    # Rows 9:16 carry the pair count for each of the eight sums.
+                    for t in 9:16
+                        acc[t, bin_idx] += 1.0
+                    end
+                end
+            end
+        end
+
+        acc
+    end
+
+    sums = OT.(combined_reduced[1:8, :])
+    counts = Int64.(combined_reduced[9:16, :])
+
+    return SFC.postprocess_single_pass_results(sums, counts, distance_bins)
+end
+
 end
@@ -453,5 +453,110 @@ function SFSA.gpu_calculate_spectrum(
     return Array(coeffs_dev), ks_phys
 end
 
+# ---------------------------------------------------------------------------
+# Single-Pass GPU Kernel
+# ---------------------------------------------------------------------------
+
+# Pairwise single-pass kernel: the work-item with global index `(i, j)` handles the point
+# pair `(i, j)`. Only work-items with `i < j` do anything, so when launched over an
+# `(N_points, N_points)` range every unordered pair is processed by exactly one work-item.
+#
+# For a pair whose separation lands in a valid bin, the velocity increment `dU` is
+# projected onto the unit separation vector `r̂` (giving `du_L`) and onto
+# `n̂ = (r̂[2], -r̂[1])` (giving `du_T`), and the following are added atomically to
+# column `bin` of `output_sums`:
+#   row 1: du_L^2 + du_T^2          row 5: du_L^3
+#   row 2: du_L^2                   row 6: du_L^2 * du_T
+#   row 3: du_T^2                   row 7: du_L * du_T^2
+#   row 4: du_L * (du_L^2 + du_T^2) row 8: du_T^3
+# All eight rows of column `bin` of `output_counts` are incremented by one.
+KA.@kernel function _sf_single_pass_kernel!(
+    output_sums,                 # Matrix{FT} of size (8, N_bins-1)
+    output_counts,               # Matrix{FT} of size (8, N_bins-1)
+    @Const(x_mat),               # Matrix{FT} of size (2, N_points)
+    @Const(u_mat),               # Matrix{FT} of size (2, N_points)
+    @Const(distance_bins),       # monotone bin edges, length N_bins
+    N_points::Int,               # not read inside the kernel body
+    N_bins::Int,                 # number of bin EDGES, i.e. one more than the number of bins
+)
+    I = @index(Global, NTuple)
+    i = I[1]
+    j = I[2]
+    
+    # Skip the diagonal and the lower triangle so each pair is counted once.
+    if i < j
+        # Static arrays on stack (using 2D coordinates)
+        X1 = SA.SVector{2}(x_mat[1, i], x_mat[2, i])
+        X2 = SA.SVector{2}(x_mat[1, j], x_mat[2, j])
+        U1 = SA.SVector{2}(u_mat[1, i], u_mat[2, i])
+        U2 = SA.SVector{2}(u_mat[1, j], u_mat[2, j])
+        
+        # Plain Euclidean separation of the two points.
+        dX = X2 - X1
+        dist = sqrt(dX[1]^2 + dX[2]^2)
+        
+        bin = SFH.digitize(dist, distance_bins)
+        
+        # Valid bins are 1 .. N_bins-1 because N_bins counts edges, not bins.
+        if 1 <= bin < N_bins
+            dU = U2 - U1
+            # NOTE(review): if two points coincide and `digitize` puts `dist == 0` in a
+            # valid bin, this division is not finite — confirm inputs have no duplicates.
+            r̂ = dX / dist
+            n̂ = SA.SVector{2, eltype(x_mat)}(r̂[2], -r̂[1])
+            
+            du_L = SA.dot(dU, r̂)
+            du_T = SA.dot(dU, n̂)
+            
+            du_L2 = du_L * du_L
+            du_T2 = du_T * du_T
+            
+            # Atomically accumulate the 8 structure functions
+            @atomic output_sums[1, bin] += du_L2 + du_T2
+            @atomic output_sums[2, bin] += du_L2
+            @atomic output_sums[3, bin] += du_T2
+            @atomic output_sums[4, bin] += du_L * (du_L2 + du_T2)
+            @atomic output_sums[5, bin] += du_L * du_L2
+            @atomic output_sums[6, bin] += du_L2 * du_T
+            @atomic output_sums[7, bin] += du_L * du_T2
+            @atomic output_sums[8, bin] += du_T * du_T2
+            
+            # Every row gets the same pair count; the increment uses the counts' own eltype.
+            for t in 1:8
+                @atomic output_counts[t, bin] += one(eltype(output_counts))
+            end
+        end
+    end
+end
+
+"""
+    SFC._dispatch_single_pass(::GPUBackend, x, u, distance_bins; workgroup_size=64, kwargs...)
+
+Calculate single-pass structure functions on the device selected by `gpu_backend.backend`.
+
+Inputs are converted on the host to `FT = promote_type(float(FT1), float(FT2))`, copied to
+the device, and `_sf_single_pass_kernel!` is launched over an `(N_points, N_points)` range.
+
+# Arguments
+- `x`: `2 × N_points` coordinate matrix.
+- `u`: matrix of the same size as `x` holding the vector value at each point.
+- `distance_bins`: bin edges; `length(distance_bins) - 1` bins are accumulated.
+
+# Keywords
+- `workgroup_size::Int = 64`: workgroup size passed to the kernel constructor.
+- `kwargs...`: accepted and ignored.
+
+# Returns
+The result of `SFC.postprocess_single_pass_results(sums, counts, distance_bins)`, where
+`sums` is an `8 × n_bins` host `Array{FT}` and `counts` an `8 × n_bins` `Int64` matrix.
+
+# Throws
+- `ErrorException` if `x` does not have exactly two rows.
+- `DimensionMismatch` if `size(u) != size(x)`.
+"""
+function SFC._dispatch_single_pass(
+    gpu_backend::SF.GPUBackend,
+    x::AbstractMatrix{FT1},
+    u::AbstractMatrix{FT2},
+    distance_bins::AbstractVector{FT3};
+    workgroup_size::Int = 64,
+    kwargs...
+) where {FT1 <: Number, FT2 <: Number, FT3 <: Number}
+    backend = gpu_backend.backend
+    FT = promote_type(float(FT1), float(FT2))
+    N_dims, N_points = size(x)
+    n_bins = length(distance_bins) - 1
+
+    if N_dims != 2
+        error("GPUExt: single-pass calculation only supports 2D coordinates (got N_dims=$N_dims)")
+    end
+
+    # `u_dev` comes from `KA.allocate` (uninitialised), so a smaller `u` would leave part
+    # of the buffer unwritten and the kernel would read it. Reject the mismatch up front.
+    if size(u) != size(x)
+        throw(DimensionMismatch(
+            "GPUExt: u has size $(size(u)) but x has size $(size(x))"
+        ))
+    end
+
+    # Allocate device arrays
+    x_dev = KA.allocate(backend, FT, 2, N_points)
+    u_dev = KA.allocate(backend, FT, 2, N_points)
+    bins_dev = KA.allocate(backend, FT, length(distance_bins))
+    out_sums_dev = KA.zeros(backend, FT, 8, n_bins)
+    # NOTE: counts are accumulated in `FT`. Floating-point integers are exact only up to
+    # 2^(significand bits) (2^24 for Float32), so a bin holding more pairs than that stops
+    # incrementing when `FT` is narrower than Float64.
+    out_cnts_dev = KA.zeros(backend, FT, 8, n_bins)
+
+    # Convert to `FT` on the host so the host-to-device copy never has to change the
+    # element type (inputs may be integer-valued or of mixed precision).
+    copyto!(x_dev, convert(Array{FT}, collect(x)))
+    copyto!(u_dev, convert(Array{FT}, collect(u)))
+    copyto!(bins_dev, convert(Array{FT}, collect(distance_bins)))
+
+    # With fewer than two points there is no pair to process; skip the launch and let the
+    # zero-initialised outputs flow through.
+    if N_points > 1
+        kernel! = _sf_single_pass_kernel!(backend, workgroup_size)
+        kernel!(
+            out_sums_dev, out_cnts_dev,
+            x_dev, u_dev,
+            bins_dev,
+            # The kernel's `N_bins` argument is the number of bin EDGES.
+            N_points, length(distance_bins);
+            ndrange = (N_points, N_points)
+        )
+    end
+    KA.synchronize(backend)
+
+    sums = Array(out_sums_dev)
+    counts = Int64.(Array(out_cnts_dev))
+
+    return SFC.postprocess_single_pass_results(sums, counts, distance_bins)
+end
 
 end # module GPUExt
@@ -2,11 +2,14 @@ module StructureFunctionsOhMyThreadsExt
 
 using Distances: Distances as DI
 using OhMyThreads: OhMyThreads as OMT
+using StaticArrays: StaticArrays as SA
+using LinearAlgebra: LinearAlgebra as LA
 using StructureFunctions:
     StructureFunctions as SF,
     Calculations as SFC,
     StructureFunctionObjects as SFO,
-    StructureFunctionTypes as SFT
+    StructureFunctionTypes as SFT,
+    HelperFunctions as SFH
 
 """
     SFC.threaded_calculate_structure_function(sf_type, x_vecs, u_vecs, distance_bins, ::Val{RSAC}; ...)
@@ -174,4 +177,90 @@ function SFC.threaded_calculate_structure_function(
     return SF.StructureFunction(structure_function_type, distance_bins, output_div)
 end
 
+"""
+    SFC._dispatch_single_pass(::ThreadedBackend, x, u, distance_bins; thread_sums=nothing, thread_counts=nothing)
+
+Calculate single-pass structure functions with multi-threaded CPU execution via OhMyThreads.jl.
+
+The outer loop over points is dealt round-robin into `n_slots` strided groups
+(`slot, slot + n_slots, ...`). Each group is processed by one task that writes only to its
+own slice `[:, :, slot]` of the reduction buffers, so no two tasks ever touch the same
+memory and the result does not depend on which OS thread a task happens to run on.
+
+# Arguments
+- `x`: matrix whose rows 1 and 2 of column `i` are read as the coordinates of point `i`.
+- `u`: matrix whose rows 1 and 2 of column `i` are read as the vector value at point `i`.
+- `distance_bins`: bin edges; `length(distance_bins) - 1` bins are accumulated.
+
+# Keywords
+- `thread_sums`: optional pre-allocated buffer of size at least `(8, n_bins, n_slots)`.
+  Allocated with `n_slots = Threads.nthreads()` when `nothing`. It is zeroed on entry.
+- `thread_counts`: optional pre-allocated integer buffer with the same size requirements.
+  It is zeroed on entry.
+- `kwargs...`: accepted and ignored.
+
+The number of slots is the smaller third dimension of the two buffers.
+
+# Returns
+The result of `SFC.postprocess_single_pass_results(sums, counts, distance_bins)`, where
+`sums` is `8 × n_bins` with element type `promote_type(float(FT1), float(FT2))` and
+`counts` is an `8 × n_bins` `Int64` matrix.
+
+# Throws
+- `DimensionMismatch` if a buffer is smaller than `(8, n_bins, 1)`.
+"""
+function SFC._dispatch_single_pass(
+    ::SF.ThreadedBackend,
+    x::AbstractMatrix{FT1},
+    u::AbstractMatrix{FT2},
+    distance_bins::AbstractVector{FT3};
+    thread_sums = nothing,
+    thread_counts = nothing,
+    kwargs...
+) where {FT1 <: Number, FT2 <: Number, FT3 <: Number}
+    OT = promote_type(float(FT1), float(FT2))
+    n_bins = length(distance_bins) - 1
+    n_threads = Threads.nthreads()
+
+    # Check/allocate the per-slot reduction buffers
+    ts = isnothing(thread_sums) ? zeros(OT, 8, n_bins, n_threads) : thread_sums
+    tc = isnothing(thread_counts) ? zeros(Int64, 8, n_bins, n_threads) : thread_counts
+
+    # Slots are indexed by task, never by `Threads.threadid()`: the thread id is not
+    # stable for the lifetime of a task and can be larger than `Threads.nthreads()`
+    # (e.g. when interactive threads exist), which would write outside the buffers.
+    n_slots = min(size(ts, 3), size(tc, 3))
+    for (name, buf) in (("thread_sums", ts), ("thread_counts", tc))
+        if size(buf, 1) < 8 || size(buf, 2) < n_bins || size(buf, 3) < 1
+            throw(DimensionMismatch(
+                "$name must have size at least (8, $n_bins, 1), got $(size(buf))"
+            ))
+        end
+    end
+
+    fill!(ts, zero(OT))
+    fill!(tc, 0)
+
+    n_points = size(x, 2)
+
+    OMT.tforeach(1:n_slots) do slot
+        # Point `i` is paired with every `j > i`, so work per `i` shrinks as `i` grows;
+        # a strided share gives every slot a mix of cheap and expensive points.
+        for i in slot:n_slots:n_points
+            x_i = SA.SVector{2, FT1}(x[1, i], x[2, i])
+            u_i = SA.SVector{2, FT2}(u[1, i], u[2, i])
+
+            for j in (i+1):n_points
+                x_j = SA.SVector{2, FT1}(x[1, j], x[2, j])
+
+                dx = SFH.δr(x_i, x_j)
+                r = LA.norm(dx)
+
+                bin_idx = SFH.digitize(r, distance_bins)
+
+                # Pairs whose separation falls outside the bin edges are dropped.
+                if 1 <= bin_idx <= n_bins
+                    u_j = SA.SVector{2, FT2}(u[1, j], u[2, j])
+                    du = u_j - u_i
+
+                    rh = SFH.r̂(x_i, x_j)
+                    nh = SFH.n̂(rh)
+
+                    # Components of the increment along `rh` and along `nh`.
+                    du_L = LA.dot(du, rh)
+                    du_T = LA.dot(du, nh)
+
+                    du_L2 = du_L * du_L
+                    du_T2 = du_T * du_T
+
+                    # `@inbounds` is safe here: the buffer sizes were validated above
+                    # and `bin_idx` / `slot` are range-checked by construction.
+                    @inbounds ts[1, bin_idx, slot] += du_L2 + du_T2
+                    @inbounds ts[2, bin_idx, slot] += du_L2
+                    @inbounds ts[3, bin_idx, slot] += du_T2
+                    @inbounds ts[4, bin_idx, slot] += du_L * (du_L2 + du_T2)
+                    @inbounds ts[5, bin_idx, slot] += du_L * du_L2
+                    @inbounds ts[6, bin_idx, slot] += du_L2 * du_T
+                    @inbounds ts[7, bin_idx, slot] += du_L * du_T2
+                    @inbounds ts[8, bin_idx, slot] += du_T * du_T2
+
+                    for t in 1:8
+                        @inbounds tc[t, bin_idx, slot] += 1
+                    end
+                end
+            end
+        end
+    end
+
+    # Reduce the per-slot slices
+    sums = zeros(OT, 8, n_bins)
+    counts = zeros(Int64, 8, n_bins)
+    for slot in 1:n_slots
+        for k in 1:n_bins
+            for t in 1:8
+                sums[t, k] += ts[t, k, slot]
+                counts[t, k] += tc[t, k, slot]
+            end
+        end
+    end
+
+    return SFC.postprocess_single_pass_results(sums, counts, distance_bins)
+end
+
 end