@@ -453,5 +453,110 @@ function SFSA.gpu_calculate_spectrum(
453453 return Array (coeffs_dev), ks_phys
454454end
455455
456+ # ---------------------------------------------------------------------------
457+ # Single-Pass GPU Kernel
458+ # ---------------------------------------------------------------------------
459+
# Pairwise single-pass kernel: one work item per ordered index pair (i, j).
#
# Only work items with i < j do any work, so each unordered pair of points is
# processed once. For such a pair the kernel computes the separation vector and
# its length, looks up the distance bin with `SFH.digitize`, projects the velocity
# difference onto the separation direction (du_L) and onto the direction obtained
# by rotating it to (r̂[2], -r̂[1]) (du_T), and atomically adds the following
# quantities to column `bin` of `output_sums`:
#
#   row 1: du_L^2 + du_T^2          row 5: du_L^3
#   row 2: du_L^2                   row 6: du_L^2 * du_T
#   row 3: du_T^2                   row 7: du_L * du_T^2
#   row 4: du_L * (du_L^2 + du_T^2) row 8: du_T^3
#
# Every row of column `bin` of `output_counts` is incremented by one per pair.
#
# Pairs whose separation is not strictly positive (coincident points, or NaN
# coordinates) are skipped: the unit vector r̂ is undefined for them and the
# resulting NaN would otherwise contaminate the accumulated sums of a whole bin.
KA.@kernel function _sf_single_pass_kernel!(
    output_sums,            # Matrix{FT} of size (8, N_bins-1)
    output_counts,          # Matrix{FT} of size (8, N_bins-1)
    @Const(x_mat),          # Matrix{FT} of size (2, N_points)
    @Const(u_mat),          # Matrix{FT} of size (2, N_points)
    @Const(distance_bins),  # monotone bin edges, length N_bins
    N_points::Int,
    N_bins::Int,
)
    I = @index(Global, NTuple)
    i = I[1]
    j = I[2]

    if i < j
        # Fixed-size 2D vectors for the two points and their velocities.
        X1 = SA.SVector{2}(x_mat[1, i], x_mat[2, i])
        X2 = SA.SVector{2}(x_mat[1, j], x_mat[2, j])
        U1 = SA.SVector{2}(u_mat[1, i], u_mat[2, i])
        U2 = SA.SVector{2}(u_mat[1, j], u_mat[2, j])

        dX = X2 - X1
        dist = sqrt(dX[1]^2 + dX[2]^2)

        bin = SFH.digitize(dist, distance_bins)

        # `dist > 0` is false for both zero and NaN distances, so the division
        # below never produces a NaN direction vector. `bin < N_bins` keeps the
        # column index within the N_bins-1 output columns.
        if dist > 0 && 1 <= bin < N_bins
            dU = U2 - U1
            r̂ = dX / dist
            n̂ = SA.SVector{2, eltype(x_mat)}(r̂[2], -r̂[1])

            du_L = SA.dot(dU, r̂)
            du_T = SA.dot(dU, n̂)

            du_L2 = du_L * du_L
            du_T2 = du_T * du_T

            # Atomically accumulate the 8 structure functions; many work items
            # may target the same bin concurrently.
            @atomic output_sums[1, bin] += du_L2 + du_T2
            @atomic output_sums[2, bin] += du_L2
            @atomic output_sums[3, bin] += du_T2
            @atomic output_sums[4, bin] += du_L * (du_L2 + du_T2)
            @atomic output_sums[5, bin] += du_L * du_L2
            @atomic output_sums[6, bin] += du_L2 * du_T
            @atomic output_sums[7, bin] += du_L * du_T2
            @atomic output_sums[8, bin] += du_T * du_T2

            # One sample per structure function for this pair.
            for t in 1:8
                @atomic output_counts[t, bin] += one(eltype(output_counts))
            end
        end
    end
end
512+
"""
    SFC._dispatch_single_pass(::GPUBackend, x, u, distance_bins; workgroup_size=64, kwargs...)

Calculate single-pass structure functions on a GPU backend.

The inputs are copied to the device, `_sf_single_pass_kernel!` is launched over an
`(N_points, N_points)` index range, and the accumulated sums and counts are copied
back to the host and handed to `SFC.postprocess_single_pass_results`.

# Arguments
- `gpu_backend`: wrapper whose `backend` field is the KernelAbstractions backend to use.
- `x`: coordinate matrix of size `(2, N_points)`; only 2D coordinates are supported.
- `u`: matrix with the same size as `x`, holding the values whose pairwise
  differences are accumulated.
- `distance_bins`: vector of bin edges; `length(distance_bins) - 1` bins are produced.

# Keywords
- `workgroup_size`: workgroup size passed to the kernel constructor.
- `kwargs...`: accepted and ignored.

# Returns
The value of `SFC.postprocess_single_pass_results(sums, counts, distance_bins)`, where
`sums` is an `(8, n_bins)` floating-point matrix and `counts` an `(8, n_bins)`
`Int64` matrix.

# Throws
- `ErrorException` if `x` does not have exactly two rows.
- `DimensionMismatch` if `size(u) != size(x)`.
- `ArgumentError` if `distance_bins` is empty.
"""
function SFC._dispatch_single_pass(
    gpu_backend::SF.GPUBackend,
    x::AbstractMatrix{FT1},
    u::AbstractMatrix{FT2},
    distance_bins::AbstractVector{FT3};
    workgroup_size::Int = 64,
    kwargs...
) where {FT1 <: Number, FT2 <: Number, FT3 <: Number}
    backend = gpu_backend.backend
    FT = promote_type(float(FT1), float(FT2))
    N_dims, N_points = size(x)
    n_edges = length(distance_bins)
    n_bins = n_edges - 1

    if N_dims != 2
        error("GPUExt: single-pass calculation only supports 2D coordinates (got N_dims=$N_dims)")
    end

    # The kernel indexes `u` with the same (row, column) indices as `x`; a size
    # mismatch would otherwise surface as an obscure copy error or as garbage
    # read from uninitialised device memory.
    size(u) == size(x) || throw(
        DimensionMismatch("GPUExt: u has size $(size(u)) but x has size $(size(x))"),
    )
    n_edges >= 1 || throw(
        ArgumentError("GPUExt: distance_bins must contain at least one bin edge"),
    )

    # Allocate device arrays
    x_dev = KA.allocate(backend, FT, 2, N_points)
    u_dev = KA.allocate(backend, FT, 2, N_points)
    bins_dev = KA.allocate(backend, FT, n_edges)
    out_sums_dev = KA.zeros(backend, FT, 8, n_bins)
    # NOTE(review): counts are accumulated in FT. For FT == Float32 a bin stops
    # incrementing exactly once it holds 2^24 pairs; consider an integer count
    # buffer if the target backends support integer atomics.
    out_cnts_dev = KA.zeros(backend, FT, 8, n_bins)

    # Materialise on the host and convert to FT there, so every host-to-device
    # copy is between arrays of identical element type.
    copyto!(x_dev, convert(Array{FT}, collect(x)))
    copyto!(u_dev, convert(Array{FT}, collect(u)))
    copyto!(bins_dev, convert(Array{FT}, collect(distance_bins)))

    # With fewer than two points there is no pair to process; the zero-filled
    # outputs are already the final result, so the launch is skipped.
    if N_points > 1
        kernel! = _sf_single_pass_kernel!(backend, workgroup_size)
        kernel!(
            out_sums_dev, out_cnts_dev,
            x_dev, u_dev,
            bins_dev,
            N_points, n_edges;
            ndrange = (N_points, N_points)
        )
    end
    KA.synchronize(backend)

    sums = Array(out_sums_dev)
    counts = Int64.(Array(out_cnts_dev))

    return SFC.postprocess_single_pass_results(sums, counts, distance_bins)
end
456561
457562end # module GPUExt
0 commit comments