Expand .df Array/Query accessor to allow indexing with NumPy and PyArrow arrays (#2170)

kounelisagis · web-flow · commit 2d6cd808993b · 2025-03-06T11:44:15.000+02:00
diff --git a/tiledb/array.py b/tiledb/array.py
@@ -1039,9 +1039,9 @@ def multi_index(self):
         """Retrieve data cells with multi-range, domain-inclusive indexing. Returns
         the cross-product of the ranges.
 
-        :param list selection: Per dimension, a scalar, ``slice``, or list of scalars
-            or ``slice`` objects. Scalars and ``slice`` components should match the
-            type of the underlying Dimension.
+        :param list selection: Per dimension, a scalar, ``slice``, a list of scalars
+            or ``slice`` objects, or a numpy/pyarrow array of scalars.
+            Scalars and ``slice`` components should match the type of the underlying Dimension.
         :returns: dict of {'attribute': result}. Coords are included by default for
             Sparse arrays only (use `Array.query(coords=<>)` to select).
         :raises IndexError: invalid or unsupported index selection
@@ -1093,9 +1093,9 @@ def df(self):
         """Retrieve data cells as a Pandas dataframe, with multi-range,
         domain-inclusive indexing using ``multi_index``.
 
-        :param list selection: Per dimension, a scalar, ``slice``, or list of scalars
-            or ``slice`` objects. Scalars and ``slice`` components should match the
-            type of the underlying Dimension.
+        :param list selection: Per dimension, a scalar, ``slice``, a list of scalars
+            or ``slice`` objects, or a numpy/pyarrow array of scalars.
+            Scalars and ``slice`` components should match the type of the underlying Dimension.
         :returns: dict of {'attribute': result}. Coords are included by default for
             Sparse arrays only (use `Array.query(coords=<>)` to select).
         :raises IndexError: invalid or unsupported index selection
diff --git a/tiledb/multirange_indexing.py b/tiledb/multirange_indexing.py
@@ -40,8 +40,14 @@
     # We don't want to import these eagerly since importing Pandas in particular
     # can add around half a second of import time even if we never use it.
     import pandas
+
+
+try:
     import pyarrow
 
+    has_pyarrow = True
+except ImportError:
+    has_pyarrow = False
 
 current_timer: ContextVar[str] = ContextVar("timer_scope")
 
@@ -112,11 +118,15 @@ def to_scalar(obj: Any) -> Scalar:
         return cast(Scalar, obj)
     if isinstance(obj, np.ndarray) and obj.ndim == 0:
         return cast(Scalar, obj[()])
+    if has_pyarrow and isinstance(obj, pyarrow.Array):
+        return to_scalar(obj.to_numpy()[()])
+    if has_pyarrow and isinstance(obj, pyarrow.Scalar):
+        return cast(Scalar, obj.as_py())
     raise ValueError(f"Cannot convert {type(obj)} to scalar")
 
 
 def iter_ranges(
-    sel: Union[Scalar, slice, Range, List[Scalar]],
+    sel: Union[Scalar, slice, Range, List[Scalar], np.ndarray, "pyarrow.Array"],
     sparse: bool,
     nonempty_domain: Optional[Range] = None,
 ) -> Iterator[Range]:
@@ -145,7 +155,9 @@ def iter_ranges(
         assert len(sel) == 2
         yield to_scalar(sel[0]), to_scalar(sel[1])
 
-    elif isinstance(sel, list):
+    elif isinstance(sel, (list, np.ndarray)) or (
+        has_pyarrow and isinstance(sel, pyarrow.Array)
+    ):
         for scalar in map(to_scalar, sel):
             yield scalar, scalar
 
@@ -178,8 +190,6 @@ def iter_label_range(sel: Union[Scalar, slice, Range, List[Scalar]]):
 
 def dim_ranges_from_selection(selection, nonempty_domain, is_sparse):
     # don't try to index nonempty_domain if None
-    if isinstance(selection, np.ndarray):
-        return selection
     selection = selection if isinstance(selection, list) else [selection]
     return tuple(
         rng for sel in selection for rng in iter_ranges(sel, is_sparse, nonempty_domain)
diff --git a/tiledb/tests/test_pandas_dataframe.py b/tiledb/tests/test_pandas_dataframe.py
@@ -1276,6 +1276,56 @@ def try_rt(name, df, pq_args={}):
         basic3 = make_dataframe_basic3()
         try_rt("basic3", basic3)
 
+    @pytest.mark.parametrize(
+        "dim_data, attr_data, dtype, domain",
+        [
+            (pyarrow.array([1, 2, 3]), pyarrow.array([1, 2, 3]), np.int64, (1, 3)),
+            (pyarrow.array(["a", "b", "c"]), pyarrow.array([1, 2, 3]), "ascii", None),
+        ],
+    )
+    def test_read_indexing_with_pyarrow_and_numpy_arrays(
+        self, dim_data, attr_data, dtype, domain
+    ):
+        # Ensure that .df can be indexed with both PyArrow and NumPy arrays.
+        uri = self.path("read_indexing_with_pyarrow_and_numpy_arrays")
+
+        dim = (
+            tiledb.Dim(name="dim_a", dtype=dtype, domain=domain)
+            if domain
+            else tiledb.Dim(name="dim_a", dtype=dtype)
+        )
+        schema = tiledb.ArraySchema(
+            domain=tiledb.Domain(dim),
+            sparse=True,
+            attrs=[tiledb.Attr(name="rand", dtype=np.int32)],
+            allows_duplicates=True,
+        )
+        tiledb.Array.create(uri, schema)
+
+        with tiledb.open(uri, "w") as arr:
+            arr[dim_data] = attr_data
+
+        with tiledb.open(uri, "r") as arr:
+            expected_df = pd.DataFrame(
+                {"dim_a": dim_data.tolist(), "rand": attr_data.tolist()}
+            )
+
+            assert_array_equal(arr.df[:], expected_df)
+            assert_array_equal(arr.df[pyarrow.array(dim_data)], expected_df)
+            assert_array_equal(arr.df[np.array(dim_data)], expected_df)
+
+            partial_dim_data = dim_data[:2]
+            expected_partial_df = expected_df.iloc[:2]
+
+            assert_array_equal(
+                arr.df[pyarrow.array(partial_dim_data)], expected_partial_df
+            )
+            assert_array_equal(arr.df[np.array(partial_dim_data)], expected_partial_df)
+
+            expected_dict = OrderedDict(
+                [("dim_a", dim_data.tolist()), ("rand", attr_data.tolist())]
+            )
+
     def test_nullable_integers(self):
         nullable_int_dtypes = (
             pd.Int64Dtype(),