|
37 | 37 | import numpy as np |
38 | 38 | from typing_extensions import Self, override |
39 | 39 |
|
| 40 | +from pytools import memoize_method |
| 41 | + |
40 | 42 | from arraycontext.container.traversal import ( |
41 | 43 | rec_map_array_container, |
42 | 44 | rec_map_container, |
|
62 | 64 | if TYPE_CHECKING: |
63 | 65 | from collections.abc import Callable, Mapping |
64 | 66 |
|
65 | | - from numpy.typing import NDArray |
| 67 | + from numpy.typing import DTypeLike, NDArray |
66 | 68 |
|
67 | 69 | import loopy as lp |
68 | 70 | import pyopencl as cl |
@@ -263,12 +265,54 @@ def to_numpy(self, array: Array) -> np.ndarray: |
263 | 265 | def to_numpy(self, array: ContainerOrScalarT) -> ContainerOrScalarT: |
264 | 266 | ... |
265 | 267 |
|
| 268 | + @memoize_method |
| 269 | + def _get_to_numpy_noncontiguous_copy_kernel( |
| 270 | + self, dtype: DTypeLike, ndim: int |
| 271 | + ) -> lp.TranslationUnit: |
| 272 | + """ |
| 273 | + Returns a translation unit containing a loopy kernel that: |
| 274 | +
|
| 275 | + - Accepts a PyOpenCL array ``input`` with per-axis strides passed as |
| 276 | + the value arguments ``s0, s1, ..., s{ndim-1}``. |
| 277 | + - Produces a contiguous, row-major (C-order) output array ``output`` of |
| 278 | + the same shape, with elements copied from the corresponding |
| 279 | + coordinates in ``input``. |
| 280 | + """ |
| 281 | + |
| 282 | + import loopy as lp |
| 283 | + |
| 284 | + from arraycontext.loopy import _DEFAULT_LOOPY_OPTIONS |
| 285 | + |
| 286 | + t_unit = lp.make_copy_kernel( |
| 287 | + ["c"] * ndim, [f"stride:s{i}" for i in range(ndim)] |
| 288 | + ) |
| 289 | + t_unit = lp.add_dtypes(t_unit, {"input": dtype}) |
| 290 | + new_args = [ |
| 291 | + *t_unit.default_entrypoint.args, |
| 292 | + *[lp.ValueArg(f"s{i}", dtype=np.uint64) for i in range(ndim)], |
| 293 | + ] |
| 294 | + t_unit = t_unit.with_kernel(t_unit.default_entrypoint.copy(args=new_args)) |
| 295 | + t_unit = lp.set_options(t_unit, _DEFAULT_LOOPY_OPTIONS) |
| 296 | + return t_unit |
| 297 | + |
266 | 298 | @override |
267 | 299 | def to_numpy(self, |
268 | 300 | array: ArrayOrContainerOrScalar |
269 | 301 | ) -> NumpyOrContainerOrScalar: |
270 | 302 | def _to_numpy(ary): |
271 | | - return ary.get(queue=self.queue) |
| 303 | + if ary.flags.forc: |
| 304 | + # pyopencl supports host transfers only for contiguous arrays; other layouts are first copied into a contiguous array below. |
| 305 | + return ary.get(queue=self.queue) |
| 306 | + |
| 307 | + result = self.call_loopy( |
| 308 | + self._get_to_numpy_noncontiguous_copy_kernel(ary.dtype, ary.ndim), |
| 309 | + input=ary, |
| 310 | + **{ |
| 311 | + f"s{i}": stride // ary.dtype.itemsize  # byte stride -> stride in elements |
| 312 | + for i, stride in enumerate(ary.strides) |
| 313 | + }, |
| 314 | + )["output"] |
| 315 | + return result.get(queue=self.queue) |
272 | 316 |
|
273 | 317 | return with_array_context( |
274 | 318 | self._rec_map_container(_to_numpy, array), |
|
0 commit comments