Move ti.sum()/prod() to dpctl_ext.tensor and reuse them in dpnp

vlad-perevezentsev · vlad-perevezentsev · commit 6b27b1bd1ca5 · 2026-03-03T05:13:30.000-08:00
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
@@ -88,6 +88,8 @@
     argmin,
     max,
     min,
+    prod,
+    sum,
 )
 from ._searchsorted import searchsorted
 from ._set_functions import (
@@ -143,6 +145,7 @@
     "ones",
     "ones_like",
     "place",
+    "prod",
     "put",
     "put_along_axis",
     "repeat",
@@ -153,6 +156,7 @@
     "sort",
     "squeeze",
     "stack",
+    "sum",
     "swapaxes",
     "take",
     "take_along_axis",
diff --git a/dpctl_ext/tensor/_reduction.py b/dpctl_ext/tensor/_reduction.py
@@ -37,6 +37,10 @@
 import dpctl_ext.tensor._tensor_reductions_impl as tri
 
 from ._numpy_helper import normalize_axis_tuple
+from ._type_utils import (
+    _default_accumulation_dtype,
+    _to_device_supported_dtype,
+)
 
 
 def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn):
@@ -137,6 +141,164 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn):
     return out
 
 
+def _reduction_over_axis(
+    x,
+    axis,
+    dtype,
+    keepdims,
+    out,
+    _reduction_fn,
+    _dtype_supported,
+    _default_reduction_type_fn,
+):
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+    nd = x.ndim
+    if axis is None:
+        axis = tuple(range(nd))
+        perm = list(axis)
+        arr = x
+    else:
+        if not isinstance(axis, (tuple, list)):
+            axis = (axis,)
+        axis = normalize_axis_tuple(axis, nd, "axis")
+        perm = [i for i in range(nd) if i not in axis] + list(axis)
+        arr = dpt_ext.permute_dims(x, perm)
+    red_nd = len(axis)
+    res_shape = arr.shape[: nd - red_nd]
+    q = x.sycl_queue
+    inp_dt = x.dtype
+    if dtype is None:
+        res_dt = _default_reduction_type_fn(inp_dt, q)
+    else:
+        res_dt = dpt.dtype(dtype)
+        res_dt = _to_device_supported_dtype(res_dt, q.sycl_device)
+
+    res_usm_type = x.usm_type
+
+    implemented_types = _dtype_supported(inp_dt, res_dt, res_usm_type, q)
+    if dtype is None and not implemented_types:
+        raise RuntimeError(
+            "Automatically determined reduction data type does not "
+            "have direct implementation"
+        )
+    orig_out = out
+    if out is not None:
+        if not isinstance(out, dpt.usm_ndarray):
+            raise TypeError(
+                f"output array must be of usm_ndarray type, got {type(out)}"
+            )
+        if not out.flags.writable:
+            raise ValueError("provided `out` array is read-only")
+        if not keepdims:
+            final_res_shape = res_shape
+        else:
+            inp_shape = x.shape
+            final_res_shape = tuple(
+                inp_shape[i] if i not in axis else 1 for i in range(nd)
+            )
+        if not out.shape == final_res_shape:
+            raise ValueError(
+                "The shape of input and output arrays are inconsistent. "
+                f"Expected output shape is {final_res_shape}, got {out.shape}"
+            )
+        if res_dt != out.dtype:
+            raise ValueError(
+                f"Output array of type {res_dt} is needed, got {out.dtype}"
+            )
+        if dpctl.utils.get_execution_queue((q, out.sycl_queue)) is None:
+            raise ExecutionPlacementError(
+                "Input and output allocation queues are not compatible"
+            )
+        if keepdims:
+            out = dpt_ext.squeeze(out, axis=axis)
+            orig_out = out
+        if ti._array_overlap(x, out) and implemented_types:
+            out = dpt_ext.empty_like(out)
+    else:
+        out = dpt_ext.empty(
+            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
+        )
+
+    _manager = SequentialOrderManager[q]
+    dep_evs = _manager.submitted_events
+    if red_nd == 0:
+        ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=arr, dst=out, sycl_queue=q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_e_cpy, cpy_e)
+        if not (orig_out is None or orig_out is out):
+            ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=out, dst=orig_out, sycl_queue=q, depends=[cpy_e]
+            )
+            _manager.add_event_pair(ht_e_cpy2, cpy2_e)
+            out = orig_out
+        return out
+
+    if implemented_types:
+        ht_e, red_e = _reduction_fn(
+            src=arr,
+            trailing_dims_to_reduce=red_nd,
+            dst=out,
+            sycl_queue=q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(ht_e, red_e)
+        if not (orig_out is None or orig_out is out):
+            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=out, dst=orig_out, sycl_queue=q, depends=[red_e]
+            )
+            _manager.add_event_pair(ht_e_cpy, cpy_e)
+            out = orig_out
+    else:
+        if _dtype_supported(res_dt, res_dt, res_usm_type, q):
+            tmp = dpt_ext.empty(
+                arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
+            )
+            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=arr, dst=tmp, sycl_queue=q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_e_cpy, cpy_e)
+            ht_e_red, red_ev = _reduction_fn(
+                src=tmp,
+                trailing_dims_to_reduce=red_nd,
+                dst=out,
+                sycl_queue=q,
+                depends=[cpy_e],
+            )
+            _manager.add_event_pair(ht_e_red, red_ev)
+        else:
+            buf_dt = _default_reduction_type_fn(inp_dt, q)
+            tmp = dpt_ext.empty(
+                arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
+            )
+            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=arr, dst=tmp, sycl_queue=q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_e_cpy, cpy_e)
+            tmp_res = dpt_ext.empty(
+                res_shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
+            )
+            ht_e_red, r_e = _reduction_fn(
+                src=tmp,
+                trailing_dims_to_reduce=red_nd,
+                dst=tmp_res,
+                sycl_queue=q,
+                depends=[cpy_e],
+            )
+            _manager.add_event_pair(ht_e_red, r_e)
+            ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=tmp_res, dst=out, sycl_queue=q, depends=[r_e]
+            )
+            _manager.add_event_pair(ht_e_cpy2, cpy2_e)
+
+    if keepdims:
+        res_shape = res_shape + (1,) * red_nd
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm)
+    return out
+
+
 def _search_over_axis(x, axis, keepdims, out, _reduction_fn):
     if not isinstance(x, dpt.usm_ndarray):
         raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
@@ -374,3 +536,132 @@ def min(x, /, *, axis=None, keepdims=False, out=None):
             array has the same data type as ``x``.
     """
     return _comparison_over_axis(x, axis, keepdims, out, tri._min_over_axis)
+
+
+def prod(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
+    """
+    Calculates the product of elements in the input array ``x``.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which products must be computed. If a tuple
+            of unique integers, products are computed over multiple axes.
+            If ``None``, the product is computed over the entire array.
+            Default: ``None``.
+        dtype (Optional[dtype]):
+            data type of the returned array. If ``None``, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If ``x`` has a real- or complex-valued floating-point data
+              type, the returned array will have the same data type as
+              ``x``.
+            * If ``x`` has signed integral data type, the returned array
+              will have the default signed integral type for the device
+              where input array ``x`` is allocated.
+            * If ``x`` has unsigned integral data type, the returned array
+              will have the default unsigned integral type for the device
+              where input array ``x`` is allocated.
+            * If ``x`` has a boolean data type, the returned array will
+              have the default signed integral type for the device
+              where input array ``x`` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of ``x``, the input array elements are cast to the
+            specified data type before computing the product.
+            Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The ``out`` array must match the expected shape and the
+            expected data type of the result or (if provided) ``dtype``.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the products. If the product was computed over
+            the entire array, a zero-dimensional array is returned. The
+            returned array has the data type as described in the ``dtype``
+            parameter description above.
+    """
+    return _reduction_over_axis(
+        x,
+        axis,
+        dtype,
+        keepdims,
+        out,
+        tri._prod_over_axis,
+        tri._prod_over_axis_dtype_supported,
+        _default_accumulation_dtype,
+    )
+
+
+def sum(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
+    """
+    Calculates the sum of elements in the input array ``x``.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which sums must be computed. If a tuple
+            of unique integers, sums are computed over multiple axes.
+            If ``None``, the sum is computed over the entire array.
+            Default: ``None``.
+        dtype (Optional[dtype]):
+            data type of the returned array. If ``None``, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If ``x`` has a real- or complex-valued floating-point data
+              type, the returned array will have the same data type as
+              ``x``.
+            * If ``x`` has signed integral data type, the returned array
+              will have the default signed integral type for the device
+              where input array ``x`` is allocated.
+            * If ``x`` has unsigned integral data type, the
+              returned array will have the default unsigned
+              integral type for the device where input
+              array ``x`` is allocated.
+            * If ``x`` has a boolean data type, the returned array will
+              have the default signed integral type for the device
+              where input array ``x`` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of ``x``, the input array elements are cast to the
+            specified data type before computing the sum.
+            Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The ``out`` array must match the expected shape and the
+            expected data type of the result or (if provided) ``dtype``.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the sums. If the sum was computed over the
+            entire array, a zero-dimensional array is returned. The returned
+            array has the data type as described in the ``dtype`` parameter
+            description above.
+    """
+    return _reduction_over_axis(
+        x,
+        axis,
+        dtype,
+        keepdims,
+        out,
+        tri._sum_over_axis,
+        tri._sum_over_axis_dtype_supported,
+        _default_accumulation_dtype,
+    )
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
@@ -428,7 +428,9 @@ def _get_first_nan_index(usm_a):
         if first_nan is not None:
             # all NaNs are collapsed, so need to put a count of all NaNs
             # at the last index
-            dpt.sum(usm_res.counts[first_nan:], out=usm_res.counts[first_nan])
+            dpt_ext.sum(
+                usm_res.counts[first_nan:], out=usm_res.counts[first_nan]
+            )
             result += (usm_res.counts[: first_nan + 1],)
         else:
             result += (usm_res.counts,)
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py