1111"""
1212from __future__ import annotations
1313
14- from typing import Optional , Sequence , Tuple , Union
14+ from typing import Optional , Tuple , Union
1515
1616import numpy as np
1717import xarray as xr
@@ -347,6 +347,49 @@ def _sort_edges(edge_arrays):
347347
348348def _extract_points (geometries , values , bounds , height , width ):
349349 """Parse Point/MultiPoint geometries into pixel coordinate arrays."""
350+ if not geometries :
351+ return (np .empty (0 , np .int32 ), np .empty (0 , np .int32 ),
352+ np .empty (0 , np .float64 ))
353+ if _HAS_SHAPELY2 :
354+ return _extract_points_vectorized (
355+ geometries , values , bounds , height , width )
356+ return _extract_points_loop (
357+ geometries , values , bounds , height , width )
358+
359+
360+ def _extract_points_vectorized (geometries , values , bounds , height , width ):
361+ """Vectorized point extraction using shapely 2.0 array ops."""
362+ import shapely
363+
364+ xmin , ymin , xmax , ymax = bounds
365+ px = (xmax - xmin ) / width
366+ py = (ymax - ymin ) / height
367+
368+ geom_arr = np .array (geometries , dtype = object )
369+ val_arr = np .array (values , dtype = np .float64 )
370+
371+ # Explode MultiPoints to individual Points
372+ parts , part_idx = shapely .get_parts (geom_arr , return_index = True )
373+ part_vals = val_arr [part_idx ]
374+
375+ if len (parts ) == 0 :
376+ return (np .empty (0 , np .int32 ), np .empty (0 , np .int32 ),
377+ np .empty (0 , np .float64 ))
378+
379+ # Extract coordinates with index back to each point
380+ coords , coord_idx = shapely .get_coordinates (
381+ parts , return_index = True )
382+ pt_vals = part_vals [coord_idx ]
383+
384+ cols = np .floor ((coords [:, 0 ] - xmin ) / px ).astype (np .int32 )
385+ rows = np .floor ((ymax - coords [:, 1 ]) / py ).astype (np .int32 )
386+
387+ valid = (rows >= 0 ) & (rows < height ) & (cols >= 0 ) & (cols < width )
388+ return (rows [valid ], cols [valid ], pt_vals [valid ])
389+
390+
391+ def _extract_points_loop (geometries , values , bounds , height , width ):
392+ """Loop-based point extraction (shapely < 2.0 fallback)."""
350393 xmin , ymin , xmax , ymax = bounds
351394 px = (xmax - xmin ) / width
352395 py = (ymax - ymin ) / height
@@ -382,8 +425,154 @@ def _extract_points(geometries, values, bounds, height, width):
382425# Line segment extraction (always on host)
383426# ---------------------------------------------------------------------------
384427
428+ _EMPTY_LINES = (np .empty (0 , np .int32 ), np .empty (0 , np .int32 ),
429+ np .empty (0 , np .int32 ), np .empty (0 , np .int32 ),
430+ np .empty (0 , np .float64 ))
431+
432+
385433def _extract_line_segments (geometries , values , bounds , height , width ):
386- """Parse LineString/MultiLineString geometries into pixel-space segments."""
434+ """Parse LineString/MultiLineString geometries into pixel-space segments.
435+
436+ Segments are clipped to the raster extent before conversion to pixel
437+ coordinates, so Bresenham never iterates over out-of-bounds pixels.
438+ """
439+ if not geometries :
440+ return _EMPTY_LINES
441+ if _HAS_SHAPELY2 :
442+ return _extract_lines_vectorized (
443+ geometries , values , bounds , height , width )
444+ return _extract_lines_loop (
445+ geometries , values , bounds , height , width )
446+
447+
448+ def _liang_barsky_clip (x0 , y0 , x1 , y1 , xmin , ymin , xmax , ymax ):
449+ """Liang-Barsky line clipping. Returns clipped (x0,y0,x1,y1) or None."""
450+ dx = x1 - x0
451+ dy = y1 - y0
452+ p = np .array ([- dx , dx , - dy , dy ])
453+ q = np .array ([x0 - xmin , xmax - x0 , y0 - ymin , ymax - y0 ])
454+
455+ t0 , t1 = 0.0 , 1.0
456+ for i in range (4 ):
457+ if p [i ] == 0.0 :
458+ if q [i ] < 0.0 :
459+ return None
460+ elif p [i ] < 0.0 :
461+ t = q [i ] / p [i ]
462+ if t > t1 :
463+ return None
464+ if t > t0 :
465+ t0 = t
466+ else :
467+ t = q [i ] / p [i ]
468+ if t < t0 :
469+ return None
470+ if t < t1 :
471+ t1 = t
472+
473+ cx0 = x0 + t0 * dx
474+ cy0 = y0 + t0 * dy
475+ cx1 = x0 + t1 * dx
476+ cy1 = y0 + t1 * dy
477+ return cx0 , cy0 , cx1 , cy1
478+
479+
480+ def _extract_lines_vectorized (geometries , values , bounds , height , width ):
481+ """Vectorized line extraction with Liang-Barsky clipping."""
482+ import shapely
483+
484+ xmin , ymin , xmax , ymax = bounds
485+ px = (xmax - xmin ) / width
486+ py = (ymax - ymin ) / height
487+
488+ geom_arr = np .array (geometries , dtype = object )
489+ val_arr = np .array (values , dtype = np .float64 )
490+
491+ # Explode MultiLineStrings to individual LineStrings
492+ parts , part_idx = shapely .get_parts (geom_arr , return_index = True )
493+ part_vals = val_arr [part_idx ]
494+
495+ if len (parts ) == 0 :
496+ return _EMPTY_LINES
497+
498+ # Get all vertex coordinates with line membership
499+ coords , coord_line_idx = shapely .get_coordinates (
500+ parts , return_index = True )
501+ n_coords = len (coords )
502+ if n_coords < 2 :
503+ return _EMPTY_LINES
504+
505+ # Mark last coordinate of each line (don't form cross-line segments)
506+ is_last = np .zeros (n_coords , dtype = bool )
507+ changes = np .nonzero (np .diff (coord_line_idx ))[0 ]
508+ is_last [changes ] = True
509+ is_last [- 1 ] = True
510+
511+ # Segments: from each non-last coordinate to its successor
512+ start_idx = np .nonzero (~ is_last )[0 ]
513+ end_idx = start_idx + 1
514+ seg_vals = part_vals [coord_line_idx [start_idx ]]
515+
516+ # World-space segment endpoints
517+ x0 = coords [start_idx , 0 ]
518+ y0 = coords [start_idx , 1 ]
519+ x1 = coords [end_idx , 0 ]
520+ y1 = coords [end_idx , 1 ]
521+
522+ # Vectorized Liang-Barsky clip to raster bounds
523+ dx = x1 - x0
524+ dy = y1 - y0
525+
526+ # p and q arrays: shape (4, n_segments)
527+ p = np .array ([- dx , dx , - dy , dy ])
528+ q = np .array ([x0 - xmin , xmax - x0 , y0 - ymin , ymax - y0 ])
529+
530+ t0 = np .zeros (len (x0 ))
531+ t1 = np .ones (len (x0 ))
532+ valid = np .ones (len (x0 ), dtype = bool )
533+
534+ for i in range (4 ):
535+ parallel = p [i ] == 0.0
536+ outside = parallel & (q [i ] < 0.0 )
537+ valid &= ~ outside
538+
539+ neg = (~ parallel ) & (p [i ] < 0.0 )
540+ pos = (~ parallel ) & (p [i ] > 0.0 )
541+
542+ with np .errstate (divide = 'ignore' , invalid = 'ignore' ):
543+ t_neg = np .where (neg , q [i ] / p [i ], 0.0 )
544+ t_pos = np .where (pos , q [i ] / p [i ], 1.0 )
545+
546+ t0 = np .where (neg , np .maximum (t0 , t_neg ), t0 )
547+ t1 = np .where (pos , np .minimum (t1 , t_pos ), t1 )
548+
549+ valid &= (t0 <= t1 )
550+
551+ # Apply clipping
552+ cx0 = x0 + t0 * dx
553+ cy0 = y0 + t0 * dy
554+ cx1 = x0 + t1 * dx
555+ cy1 = y0 + t1 * dy
556+
557+ # Convert to pixel space and floor to int32
558+ r0 = np .floor ((ymax - cy0 ) / py ).astype (np .int32 )
559+ c0 = np .floor ((cx0 - xmin ) / px ).astype (np .int32 )
560+ r1 = np .floor ((ymax - cy1 ) / py ).astype (np .int32 )
561+ c1 = np .floor ((cx1 - xmin ) / px ).astype (np .int32 )
562+
563+ # Clamp edge cases (clipping guarantees in-bounds but float rounding
564+ # at exact boundaries can produce height or width)
565+ np .clip (r0 , 0 , height - 1 , out = r0 )
566+ np .clip (c0 , 0 , width - 1 , out = c0 )
567+ np .clip (r1 , 0 , height - 1 , out = r1 )
568+ np .clip (c1 , 0 , width - 1 , out = c1 )
569+
570+ v = valid
571+ return (r0 [v ], c0 [v ], r1 [v ], c1 [v ], seg_vals [v ])
572+
573+
574+ def _extract_lines_loop (geometries , values , bounds , height , width ):
575+ """Loop-based line extraction with Liang-Barsky clipping (fallback)."""
387576 xmin , ymin , xmax , ymax = bounds
388577 px = (xmax - xmin ) / width
389578 py = (ymax - ymin ) / height
@@ -401,19 +590,26 @@ def _extract_line_segments(geometries, values, bounds, height, width):
401590 continue
402591 for line in lines :
403592 coords = np .asarray (line .coords )
404- rows = (ymax - coords [:, 1 ]) / py
405- cols = (coords [:, 0 ] - xmin ) / px
406593 for i in range (len (coords ) - 1 ):
407- all_r0 .append (np .int32 (int (np .floor (rows [i ]))))
408- all_c0 .append (np .int32 (int (np .floor (cols [i ]))))
409- all_r1 .append (np .int32 (int (np .floor (rows [i + 1 ]))))
410- all_c1 .append (np .int32 (int (np .floor (cols [i + 1 ]))))
594+ clipped = _liang_barsky_clip (
595+ coords [i , 0 ], coords [i , 1 ],
596+ coords [i + 1 , 0 ], coords [i + 1 , 1 ],
597+ xmin , ymin , xmax , ymax )
598+ if clipped is None :
599+ continue
600+ cx0 , cy0 , cx1 , cy1 = clipped
601+ r0 = min (max (int (np .floor ((ymax - cy0 ) / py )), 0 ), height - 1 )
602+ c0 = min (max (int (np .floor ((cx0 - xmin ) / px )), 0 ), width - 1 )
603+ r1 = min (max (int (np .floor ((ymax - cy1 ) / py )), 0 ), height - 1 )
604+ c1 = min (max (int (np .floor ((cx1 - xmin ) / px )), 0 ), width - 1 )
605+ all_r0 .append (np .int32 (r0 ))
606+ all_c0 .append (np .int32 (c0 ))
607+ all_r1 .append (np .int32 (r1 ))
608+ all_c1 .append (np .int32 (c1 ))
411609 all_vals .append (np .float64 (val ))
412610
413611 if not all_r0 :
414- return (np .empty (0 , np .int32 ), np .empty (0 , np .int32 ),
415- np .empty (0 , np .int32 ), np .empty (0 , np .int32 ),
416- np .empty (0 , np .float64 ))
612+ return _EMPTY_LINES
417613 return (np .array (all_r0 , np .int32 ), np .array (all_c0 , np .int32 ),
418614 np .array (all_r1 , np .int32 ), np .array (all_c1 , np .int32 ),
419615 np .array (all_vals , np .float64 ))
@@ -649,26 +845,22 @@ def _merge_pixel_gpu(out, written, r, c, val, mode):
649845 out [r , c ] = out [r , c ] + 1.0
650846
651847 @cuda .jit
652- def _scanline_fill_gpu (out , written , edge_y_min , edge_y_max ,
653- edge_x_at_ymin , edge_inv_slope , edge_value ,
654- edge_geom_id , n_edges , width , mode ):
655- """CUDA kernel: one thread per raster row."""
848+ def _scanline_fill_gpu (out , written , edge_y_min , edge_x_at_ymin ,
849+ edge_inv_slope , edge_value , edge_geom_id ,
850+ row_ptr , col_idx , width , mode ):
851+ """CUDA kernel: one thread per raster row, CSR-indexed active edges.
852+
853+ Instead of binary-searching the sorted edge table and scanning
854+ through dead edges, each thread reads its active edge list
855+ directly from the precomputed CSR structure (row_ptr, col_idx).
856+ """
656857 row = cuda .grid (1 )
657858 if row >= out .shape [0 ]:
658859 return
659860
660- lo , hi = 0 , n_edges
661- while lo < hi :
662- mid = (lo + hi ) // 2
663- if edge_y_min [mid ] <= row :
664- lo = mid + 1
665- else :
666- hi = mid
667-
668- count = 0
669- for e in range (hi ):
670- if edge_y_max [e ] >= row :
671- count += 1
861+ start = row_ptr [row ]
862+ end = row_ptr [row + 1 ]
863+ count = end - start
672864
673865 if count == 0 :
674866 return
@@ -681,18 +873,16 @@ def _scanline_fill_gpu(out, written, edge_y_min, edge_y_max,
681873 vs = cuda .local .array (512 , dtype = np .float64 )
682874 gs = cuda .local .array (512 , dtype = np .int32 )
683875
684- idx = 0
685- for e in range (hi ):
686- if idx >= MAX_ISECT :
876+ actual = 0
877+ for k in range (start , end ):
878+ if actual >= MAX_ISECT :
687879 break
688- if edge_y_max [e ] >= row :
689- xs [idx ] = (edge_x_at_ymin [e ]
690- + (row - edge_y_min [e ]) * edge_inv_slope [e ])
691- vs [idx ] = edge_value [e ]
692- gs [idx ] = edge_geom_id [e ]
693- idx += 1
694-
695- actual = idx
880+ e = col_idx [k ]
881+ xs [actual ] = (edge_x_at_ymin [e ]
882+ + (row - edge_y_min [e ]) * edge_inv_slope [e ])
883+ vs [actual ] = edge_value [e ]
884+ gs [actual ] = edge_geom_id [e ]
885+ actual += 1
696886
697887 # Insertion sort by (geom_id, x)
698888 for i in range (1 , actual ):
@@ -794,6 +984,46 @@ def _burn_lines_gpu(out, written, r0_arr, c0_arr, r1_arr, c1_arr, vals,
794984 return _gpu_kernels
795985
796986
987+ @ngjit
988+ def _build_row_csr_numba (edge_y_min , edge_y_max , height ):
989+ """Numba-accelerated CSR builder for GPU scanline precomputation."""
990+ n_edges = len (edge_y_min )
991+
992+ # Pass 1: count active edges per row
993+ counts = np .zeros (height , dtype = np .int32 )
994+ for e in range (n_edges ):
995+ y_lo = edge_y_min [e ]
996+ y_hi = edge_y_max [e ]
997+ if y_hi >= height :
998+ y_hi = height - 1
999+ for r in range (y_lo , y_hi + 1 ):
1000+ counts [r ] += 1
1001+
1002+ # Build row_ptr (prefix sum)
1003+ row_ptr = np .empty (height + 1 , dtype = np .int32 )
1004+ row_ptr [0 ] = 0
1005+ for r in range (height ):
1006+ row_ptr [r + 1 ] = row_ptr [r ] + counts [r ]
1007+
1008+ # Pass 2: fill col_idx
1009+ total = row_ptr [height ]
1010+ col_idx = np .empty (total , dtype = np .int32 )
1011+ offsets = np .empty (height , dtype = np .int32 )
1012+ for r in range (height ):
1013+ offsets [r ] = row_ptr [r ]
1014+
1015+ for e in range (n_edges ):
1016+ y_lo = edge_y_min [e ]
1017+ y_hi = edge_y_max [e ]
1018+ if y_hi >= height :
1019+ y_hi = height - 1
1020+ for r in range (y_lo , y_hi + 1 ):
1021+ col_idx [offsets [r ]] = e
1022+ offsets [r ] += 1
1023+
1024+ return row_ptr , col_idx
1025+
1026+
7971027def _run_cupy (geometries , values , bounds , height , width , fill , dtype ,
7981028 all_touched , merge_mode ):
7991029 """CuPy backend for rasterize."""
@@ -816,18 +1046,23 @@ def _run_cupy(geometries, values, bounds, height, width, fill, dtype,
8161046 edge_value , edge_geom_id = edge_arrays
8171047
8181048 if len (edge_y_min ) > 0 :
1049+ # Build CSR structure on CPU, then transfer to GPU
1050+ row_ptr , col_idx = _build_row_csr_numba (
1051+ edge_y_min , edge_y_max , height )
1052+
8191053 d_y_min = cupy .asarray (edge_y_min )
820- d_y_max = cupy .asarray (edge_y_max )
8211054 d_x_at_ymin = cupy .asarray (edge_x_at_ymin )
8221055 d_inv_slope = cupy .asarray (edge_inv_slope )
8231056 d_value = cupy .asarray (edge_value )
8241057 d_geom_id = cupy .asarray (edge_geom_id )
1058+ d_row_ptr = cupy .asarray (row_ptr )
1059+ d_col_idx = cupy .asarray (col_idx )
8251060
8261061 tpb = 256
8271062 blocks = (height + tpb - 1 ) // tpb
8281063 kernels ['scanline_fill' ][blocks , tpb ](
829- out , written , d_y_min , d_y_max , d_x_at_ymin , d_inv_slope ,
830- d_value , d_geom_id , len ( edge_y_min ) , width , merge_mode )
1064+ out , written , d_y_min , d_x_at_ymin , d_inv_slope ,
1065+ d_value , d_geom_id , d_row_ptr , d_col_idx , width , merge_mode )
8311066
8321067 # 2. Lines
8331068 r0 , c0 , r1 , c1 , lvals = _extract_line_segments (
0 commit comments