Add NaN-skipping to focal_stats CUDA kernels (#1092)

brendancol · brendancol · commit b627c89de898 · 2026-03-30T11:56:05.000-07:00
All focal_stats CUDA kernels (_focal_mean_cuda, _focal_sum_cuda,
_focal_std_cuda, _focal_var_cuda, _focal_range_cuda, _focal_min_cuda,
_focal_max_cuda) now skip NaN neighbors with a self-inequality check
(e.g. `if v != v: continue`), matching the numpy path, which uses
np.nanmean/nansum/nanstd/etc.

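Previously, NaN propagated through arithmetic, giving different
results on GPU vs CPU when the input contained NaN.

When a window has no valid (in-bounds, unmasked, non-NaN) neighbors,
the kernels now write NaN instead of falling back to 0.0 or the
center cell's value.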
diff --git a/xrspatial/focal.py b/xrspatial/focal.py
@@ -608,13 +608,13 @@ def _focal_min_cuda(data, kernel, out):
 
             if 0 <= ii < rows and 0 <= jj < cols:
                 v = data[ii, jj]
+                if v != v:  # NaN check
+                    continue
                 if (not found) or (v < m):
                     m = v
                     found = True
 
-    # With your mask containing the center, found should be True.
-    # But keep a safe fallback anyway.
-    out[i, j] = m if found else data[i, j]
+    out[i, j] = m if found else math.nan
 
 
 @cuda.jit
@@ -636,20 +636,20 @@ def _focal_max_cuda(data, kernel, out):
         for h in range(kernel.shape[1]):
             w = kernel[k, h]
             if w == 0:
-                continue  # mask says "ignore this neighbor"
+                continue
 
             ii = i + k - dr
             jj = j + h - dc
 
             if 0 <= ii < rows and 0 <= jj < cols:
                 v = data[ii, jj]
+                if v != v:  # NaN check
+                    continue
                 if (not found) or (v > m):
                     m = v
                     found = True
 
-    # With your mask containing the center (1), found should always be True.
-    # But keep this for safety.
-    out[i, j] = m if found else data[i, j]
+    out[i, j] = m if found else math.nan
 
 
 def _focal_range_cupy(data, kernel):
@@ -684,6 +684,8 @@ def _focal_range_cuda(data, kernel, out):
 
             if 0 <= ii < rows and 0 <= jj < cols:
                 v = data[ii, jj]
+                if v != v:  # NaN check
+                    continue
                 if not found:
                     mx = v
                     mn = v
@@ -694,7 +696,7 @@ def _focal_range_cuda(data, kernel, out):
                     if v < mn:
                         mn = v
 
-    out[i, j] = (mx - mn) if found else 0.0
+    out[i, j] = (mx - mn) if found else math.nan
 
 
 @cuda.jit
@@ -716,29 +718,29 @@ def _focal_std_cuda(data, kernel, out):
         for h in range(kernel.shape[1]):
             w = kernel[k, h]
             if w == 0:
-                continue  # mask says ignore
+                continue
 
             ii = i + k - dr
             jj = j + h - dc
 
             if 0 <= ii < rows and 0 <= jj < cols:
                 x = data[ii, jj]
+                if x != x:  # NaN check
+                    continue
                 w_sum += w
                 sum_wx += w * x
                 sum_wx2 += w * x * x
 
-    # With your mask including the center, w_sum should be > 0. Guard anyway.
     if w_sum > 0.0:
         mean = sum_wx / w_sum
         var = (sum_wx2 / w_sum) - (mean * mean)
 
-        # Numerical safety (tiny negative due to floating point)
         if var < 0.0:
             var = 0.0
 
         out[i, j] = math.sqrt(var)
     else:
-        out[i, j] = 0.0
+        out[i, j] = math.nan
 
 
 @cuda.jit
@@ -760,13 +762,15 @@ def _focal_var_cuda(data, kernel, out):
         for h in range(kernel.shape[1]):
             w = kernel[k, h]
             if w == 0:
-                continue  # mask says ignore
+                continue
 
             ii = i + k - dr
             jj = j + h - dc
 
             if 0 <= ii < rows and 0 <= jj < cols:
                 x = data[ii, jj]
+                if x != x:  # NaN check
+                    continue
                 w_sum += w
                 sum_wx += w * x
                 sum_wx2 += w * x * x
@@ -775,13 +779,12 @@ def _focal_var_cuda(data, kernel, out):
         mean = sum_wx / w_sum
         var = (sum_wx2 / w_sum) - (mean * mean)
 
-        # numerical guard for tiny negative due to float rounding
         if var < 0.0:
             var = 0.0
 
         out[i, j] = var
     else:
-        out[i, j] = 0.0
+        out[i, j] = math.nan
 
 
 @cuda.jit
@@ -852,19 +855,24 @@ def _focal_sum_cuda(data, kernel, out):
     dc = kernel.shape[1] // 2
 
     s = 0.0
+    found = False
     for k in range(kernel.shape[0]):
         for h in range(kernel.shape[1]):
             w = kernel[k, h]
             if w == 0:
-                continue  # mask says ignore
+                continue
 
             ii = i + k - dr
             jj = j + h - dc
 
             if 0 <= ii < rows and 0 <= jj < cols:
-                s += w * data[ii, jj]
+                v = data[ii, jj]
+                if v != v:  # NaN check
+                    continue
+                s += w * v
+                found = True
 
-    out[i, j] = s
+    out[i, j] = s if found else math.nan
 
 
 def _focal_stats_func_cupy(data, kernel, func=_focal_max_cuda):
@@ -894,21 +902,22 @@ def _focal_mean_cuda(data, kernel, out):
         for h in range(kernel.shape[1]):
             w = kernel[k, h]
             if w == 0:
-                continue  # mask says ignore
+                continue
 
             ii = i + k - dr
             jj = j + h - dc
 
             if 0 <= ii < rows and 0 <= jj < cols:
-                s += w * data[ii, jj]
+                v = data[ii, jj]
+                if v != v:  # NaN check
+                    continue
+                s += w * v
                 w_sum += w
 
-    # With your mask including the center, w_sum should be > 0.
-    # Guard anyway to avoid divide-by-zero.
     if w_sum > 0.0:
         out[i, j] = s / w_sum
     else:
-        out[i, j] = data[i, j]
+        out[i, j] = math.nan
 
 
 def _focal_stats_cupy(agg, kernel, stats_funcs):