xarray-contrib
diff --git a/‎xrspatial/reproject/__init__.py‎
Lines changed: 125 additions & 39 deletions b/‎xrspatial/reproject/__init__.py‎
Lines changed: 125 additions & 39 deletions
@@ -63,6 +63,75 @@ def _is_y_descending(raster):
     return float(y[0]) > float(y[-1])
 
 
+# ---------------------------------------------------------------------------
+# Per-chunk coordinate transform
+# ---------------------------------------------------------------------------
+
+def _transform_coords(transformer, chunk_bounds, chunk_shape,
+                      transform_precision, src_crs=None, tgt_crs=None):
+    """Compute source CRS coordinates for every output pixel.
+
+    When *transform_precision* is 0, every pixel is transformed through
+    pyproj exactly (same strategy as GDAL/rasterio).  Otherwise an
+    approximate bilinear control-grid interpolation is used.
+
+    For common CRS pairs (WGS84/NAD83 <-> UTM, WGS84 <-> Web Mercator),
+    a Numba JIT fast path bypasses pyproj entirely for ~30x speedup.
+    The fast path is attempted first whenever both *src_crs* and
+    *tgt_crs* are given, regardless of *transform_precision*; only if it
+    returns ``None`` or raises are the pyproj-based paths below used.
+
+    Parameters
+    ----------
+    transformer : object
+        Must provide ``transform(x, y)`` accepting flat 1-D arrays; it is
+        also handed to ``ApproximateTransform`` on the approximate path.
+    chunk_bounds : tuple
+        ``(left, bottom, right, top)`` of the output chunk.
+    chunk_shape : tuple
+        ``(height, width)`` of the output chunk in pixels.
+    transform_precision : int
+        0 selects the exact per-pixel path; any other value is passed
+        to ``ApproximateTransform`` as ``precision``.
+    src_crs, tgt_crs : optional
+        Forwarded to ``try_numba_transform``; leaving either as ``None``
+        skips the fast path.
+
+    Returns
+    -------
+    src_y, src_x : ndarray (height, width)
+    """
+    # Try Numba fast path for common projections.  The import is local and
+    # every Exception (including ImportError) is swallowed, so a missing or
+    # failing fast path never blocks the pyproj fallback below.
+    if src_crs is not None and tgt_crs is not None:
+        try:
+            from ._projections import try_numba_transform
+            result = try_numba_transform(
+                src_crs, tgt_crs, chunk_bounds, chunk_shape,
+            )
+            # None means "no fast path for this CRS pair"; anything else is
+            # returned as-is.
+            if result is not None:
+                return result
+        except Exception:
+            pass  # fall through to pyproj
+
+    height, width = chunk_shape
+    left, bottom, right, top = chunk_bounds
+    # Size of one output pixel along x and y.
+    # NOTE(review): a zero height or width is not guarded against before
+    # these divisions -- presumably callers never pass empty chunks; confirm.
+    res_x = (right - left) / width
+    res_y = (top - bottom) / height
+
+    if transform_precision == 0:
+        # Exact per-pixel transform via pyproj bulk API.
+        # Process in row strips to keep memory bounded and improve
+        # cache locality for large rasters.
+        # Pixel-centre x coordinates (hence the +0.5), shared by every row.
+        out_x_1d = left + (np.arange(width, dtype=np.float64) + 0.5) * res_x
+        src_x_out = np.empty((height, width), dtype=np.float64)
+        src_y_out = np.empty((height, width), dtype=np.float64)
+        # Number of output rows handed to transformer.transform per call.
+        strip = 256
+        for r0 in range(0, height, strip):
+            r1 = min(r0 + strip, height)
+            n_rows = r1 - r0
+            # Pixel-centre y coordinates, counted downward from `top`
+            # (row 0 is the topmost row).
+            out_y_strip = top - (np.arange(r0, r1, dtype=np.float64) + 0.5) * res_y
+            # Expand to flat, row-major (n_rows * width) x/y arrays.  Note
+            # that np.tile and np.repeat each allocate a strip-sized
+            # temporary; only the strip loop keeps these from being HxW.
+            sx, sy = transformer.transform(
+                np.tile(out_x_1d, n_rows),
+                np.repeat(out_y_strip, width),
+            )
+            src_x_out[r0:r1] = np.asarray(sx, dtype=np.float64).reshape(n_rows, width)
+            src_y_out[r0:r1] = np.asarray(sy, dtype=np.float64).reshape(n_rows, width)
+        # Return order is (y, x), matching the docstring.
+        return src_y_out, src_x_out
+
+    # Approximate: bilinear interpolation on a coarse control grid.
+    approx = ApproximateTransform(
+        transformer, chunk_bounds, chunk_shape,
+        precision=transform_precision,
+    )
+    # Row/column pixel-index grids.  np.broadcast_to yields (height, width)
+    # views of the 1-D ranges instead of materialising a full meshgrid.
+    row_grid = np.arange(height, dtype=np.float64)[:, np.newaxis]
+    col_grid = np.arange(width, dtype=np.float64)[np.newaxis, :]
+    row_grid = np.broadcast_to(row_grid, (height, width))
+    col_grid = np.broadcast_to(col_grid, (height, width))
+    # Callers unpack this result as (src_y, src_x).
+    return approx(row_grid, col_grid)
+
+
 # ---------------------------------------------------------------------------
 # Per-chunk worker functions
 # ---------------------------------------------------------------------------
@@ -89,20 +158,11 @@ def _reproject_chunk_numpy(
         tgt_crs, src_crs, always_xy=True
     )
 
-    height, width = chunk_shape
-    approx = ApproximateTransform(
-        transformer, chunk_bounds_tuple, chunk_shape,
-        precision=transform_precision,
-    )
-
-    # All output pixel positions (broadcast 1-D arrays to avoid HxW meshgrid)
-    row_grid = np.arange(height, dtype=np.float64)[:, np.newaxis]
-    col_grid = np.arange(width, dtype=np.float64)[np.newaxis, :]
-    row_grid = np.broadcast_to(row_grid, (height, width))
-    col_grid = np.broadcast_to(col_grid, (height, width))
-
     # Source CRS coordinates for each output pixel
-    src_y, src_x = approx(row_grid, col_grid)
+    src_y, src_x = _transform_coords(
+        transformer, chunk_bounds_tuple, chunk_shape, transform_precision,
+        src_crs=src_crs, tgt_crs=tgt_crs,
+    )
 
     # Convert source CRS coordinates to source pixel coordinates
     src_left, src_bottom, src_right, src_top = source_bounds_tuple
@@ -170,35 +230,59 @@ def _reproject_chunk_cupy(
         tgt_crs, src_crs, always_xy=True
     )
 
-    height, width = chunk_shape
-    approx = ApproximateTransform(
-        transformer, chunk_bounds_tuple, chunk_shape,
-        precision=transform_precision,
-    )
-
-    row_grid = np.arange(height, dtype=np.float64)[:, np.newaxis]
-    col_grid = np.arange(width, dtype=np.float64)[np.newaxis, :]
-    row_grid = np.broadcast_to(row_grid, (height, width))
-    col_grid = np.broadcast_to(col_grid, (height, width))
+    # Try CUDA transform first (keeps coordinates on-device)
+    cuda_result = None
+    if src_crs is not None and tgt_crs is not None:
+        try:
+            from ._projections_cuda import try_cuda_transform
+            cuda_result = try_cuda_transform(
+                src_crs, tgt_crs, chunk_bounds_tuple, chunk_shape,
+            )
+        except Exception:
+            pass
 
-    # Control grid is on CPU
-    src_y, src_x = approx(row_grid, col_grid)
+    if cuda_result is not None:
+        src_y, src_x = cuda_result  # cupy arrays
+        src_left, src_bottom, src_right, src_top = source_bounds_tuple
+        src_h, src_w = source_shape
+        src_res_x = (src_right - src_left) / src_w
+        src_res_y = (src_top - src_bottom) / src_h
+        # Pixel coordinate math stays on GPU via cupy operators
+        src_col_px = (src_x - src_left) / src_res_x - 0.5
+        if source_y_desc:
+            src_row_px = (src_top - src_y) / src_res_y - 0.5
+        else:
+            src_row_px = (src_y - src_bottom) / src_res_y - 0.5
+        # Need min/max on CPU for window selection
+        r_min = int(cp.floor(cp.nanmin(src_row_px)).get()) - 2
+        r_max = int(cp.ceil(cp.nanmax(src_row_px)).get()) + 3
+        c_min = int(cp.floor(cp.nanmin(src_col_px)).get()) - 2
+        c_max = int(cp.ceil(cp.nanmax(src_col_px)).get()) + 3
+        # Convert to numpy for downstream resampling
+        src_row_px = cp.asnumpy(src_row_px)
+        src_col_px = cp.asnumpy(src_col_px)
+    else:
+        # CPU fallback (Numba JIT or pyproj)
+        src_y, src_x = _transform_coords(
+            transformer, chunk_bounds_tuple, chunk_shape, transform_precision,
+            src_crs=src_crs, tgt_crs=tgt_crs,
+        )
 
-    src_left, src_bottom, src_right, src_top = source_bounds_tuple
-    src_h, src_w = source_shape
-    src_res_x = (src_right - src_left) / src_w
-    src_res_y = (src_top - src_bottom) / src_h
+        src_left, src_bottom, src_right, src_top = source_bounds_tuple
+        src_h, src_w = source_shape
+        src_res_x = (src_right - src_left) / src_w
+        src_res_y = (src_top - src_bottom) / src_h
 
-    src_col_px = (src_x - src_left) / src_res_x - 0.5
-    if source_y_desc:
-        src_row_px = (src_top - src_y) / src_res_y - 0.5
-    else:
-        src_row_px = (src_y - src_bottom) / src_res_y - 0.5
+        src_col_px = (src_x - src_left) / src_res_x - 0.5
+        if source_y_desc:
+            src_row_px = (src_top - src_y) / src_res_y - 0.5
+        else:
+            src_row_px = (src_y - src_bottom) / src_res_y - 0.5
 
-    r_min = int(np.floor(np.nanmin(src_row_px))) - 2
-    r_max = int(np.ceil(np.nanmax(src_row_px))) + 3
-    c_min = int(np.floor(np.nanmin(src_col_px))) - 2
-    c_max = int(np.ceil(np.nanmax(src_col_px))) + 3
+        r_min = int(np.floor(np.nanmin(src_row_px))) - 2
+        r_max = int(np.ceil(np.nanmax(src_row_px))) + 3
+        c_min = int(np.floor(np.nanmin(src_col_px))) - 2
+        c_max = int(np.ceil(np.nanmax(src_col_px))) + 3
 
     if r_min >= src_h or r_max <= 0 or c_min >= src_w or c_max <= 0:
         return cp.full(chunk_shape, nodata, dtype=cp.float64)
@@ -271,7 +355,9 @@ def reproject(
     nodata : float or None
         Nodata value. Auto-detected if None.
     transform_precision : int
-        Coarse grid subdivisions for approximate transform (default 16).
+        Control-grid subdivisions for the coordinate transform (default 16).
+        Higher values increase accuracy at the cost of more pyproj calls.
+        Set to 0 for exact per-pixel transforms matching GDAL/rasterio.
     chunk_size : int or (int, int) or None
         Output chunk size for dask. Defaults to 512.
     name : str or None