diff --git a/sklearnex/neighbors/_lof.py b/sklearnex/neighbors/_lof.py index 20d9cf313f..a23f780ab3 100644 --- a/sklearnex/neighbors/_lof.py +++ b/sklearnex/neighbors/_lof.py @@ -31,7 +31,6 @@ from sklearnex.neighbors.knn_unsupervised import NearestNeighbors from ..utils._array_api import enable_array_api, get_namespace -from ..utils.validation import validate_data @enable_array_api @@ -141,6 +140,10 @@ def fit(self, X, y=None): self._fit_X = xp.asarray(self._fit_X, device=device) return self + # Note: this is overriding an internal method from scikit-learn with + # the same signature. In this case, 'validate_data' is called during + # 'decision_function', which calls '.kneighbors()'. Hence, it doesn't + # need to validate the namespace of 'X' with '_fit_X' here. def _predict(self, X=None): check_is_fitted(self) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 02cc8f8968..221f49a674 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -29,6 +29,7 @@ if sklearn_check_version("1.9"): from sklearn.utils._sparse import _align_api_if_sparse + from sklearn.utils._array_api import get_namespace_and_device, move_to from onedal._device_offload import _transfer_to_host from onedal.utils._array_api import _is_numpy_namespace @@ -37,7 +38,6 @@ from .._utils import PatchingConditionsChain from ..base import oneDALEstimator from ..utils._array_api import get_namespace -from ..utils.validation import validate_data class KNeighborsDispatchingBase(oneDALEstimator): @@ -51,11 +51,20 @@ def _get_weights(self, dist, weights): # if user attempts to classify a point that was zero distance from one # or more training points, those training points are weighted as 1.0 # and the other points as 0.0 - with xp.errstate(divide="ignore"): - dist = 1.0 / dist + if _is_numpy_namespace(xp): + with xp.errstate(divide="ignore"): + dist = 1.0 / dist + else: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + dist = 1.0 / 
dist inf_mask = xp.isinf(dist) inf_row = xp.any(inf_mask, axis=1) - dist[inf_row] = inf_mask[inf_row] + if _is_numpy_namespace(xp): + # Note: older numpy do not have 'np.astype' + dist[inf_row] = inf_mask[inf_row] + else: + dist[inf_row] = xp.astype(inf_mask[inf_row], dist.dtype) return dist elif callable(weights): return weights(dist) @@ -84,11 +93,19 @@ def _compute_weighted_prediction(self, neigh_dist, neigh_ind, weights_param, y_t array-like Predicted values. """ - xp, _ = get_namespace(y_train) - if not _is_numpy_namespace(xp): + # Note: in theory, the logic should be that 'y_train' should be converted + # to the namespace of 'neigh_dist', but by this point, 'y_train' should + # already have been moved to X's namespace, so it's fine to move 'neigh_dist'. + if sklearn_check_version("1.9"): + xp, _, device = get_namespace_and_device(y_train) + neigh_dist = move_to(neigh_dist, xp=xp, device=device) + neigh_ind = move_to(neigh_ind, xp=xp, device=device) + else: + xp, _ = get_namespace(y_train) device = getattr(y_train, "device", None) - neigh_dist = xp.asarray(neigh_dist, device=device) - neigh_ind = xp.asarray(neigh_ind, device=device) + if not _is_numpy_namespace(xp): + neigh_dist = xp.asarray(neigh_dist, device=device) + neigh_ind = xp.asarray(neigh_ind, device=device) weights = self._get_weights(neigh_dist, weights_param) @@ -113,9 +130,7 @@ def _compute_weighted_prediction(self, neigh_dist, neigh_ind, weights_param, y_t y_pred_shape = (neigh_ind.shape[0], _y.shape[1]) if not _is_numpy_namespace(xp): # Array API: pass device to ensure same device as input - y_pred = xp.empty( - y_pred_shape, dtype=neigh_dist.dtype, device=neigh_ind.device - ) + y_pred = xp.empty(y_pred_shape, dtype=neigh_dist.dtype, device=device) else: # Numpy: no device parameter y_pred = xp.empty(y_pred_shape, dtype=neigh_dist.dtype) @@ -164,11 +179,16 @@ def _compute_class_probabilities( array-like Class probabilities. 
""" - xp, _ = get_namespace(y_train) - if not _is_numpy_namespace(xp): + if sklearn_check_version("1.9"): + xp, _, device = get_namespace_and_device(y_train) + neigh_dist = move_to(neigh_dist, xp=xp, device=device) + neigh_ind = move_to(neigh_ind, xp=xp, device=device) + else: + xp, _ = get_namespace(y_train) device = getattr(y_train, "device", None) - neigh_dist = xp.asarray(neigh_dist, device=device) - neigh_ind = xp.asarray(neigh_ind, device=device) + if not _is_numpy_namespace(xp): + neigh_dist = xp.asarray(neigh_dist, device=device) + neigh_ind = xp.asarray(neigh_ind, device=device) _y = y_train classes_ = classes @@ -207,9 +227,9 @@ def _compute_class_probabilities( proba_k = xp.zeros( (n_classes, n_queries), dtype=neigh_dist.dtype, - device=neigh_dist.device, + device=device, ) - zero = xp.zeros(1, dtype=neigh_dist.dtype, device=neigh_dist.device) + zero = xp.zeros(1, dtype=neigh_dist.dtype, device=device) for c in range(n_classes): mask = pred_labels == c proba_k[c, :] = xp.sum(xp.where(mask, weights_k, zero), axis=1) @@ -654,6 +674,8 @@ def _onedal_gpu_supported(self, method_name, *data): def _onedal_cpu_supported(self, method_name, *data): return self._onedal_supported("cpu", method_name, *data) + # Note: since this transfers the data to host, it doesn't validate + # that the array namespaces and devices of 'X' and '_fit_X' match. 
def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): check_is_fitted(self) if n_neighbors is None: diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index e4383da4c5..9788722a20 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -33,6 +33,13 @@ from ..utils.validation import validate_data from .common import KNeighborsDispatchingBase +if sklearn_check_version("1.9"): + from sklearn.utils._array_api import ( + check_same_namespace, + get_namespace_and_device, + move_to, + ) + @enable_array_api @control_n_jobs( @@ -72,7 +79,12 @@ def __init__( ) def fit(self, X, y): - xp, is_array_api = get_namespace(X) + if sklearn_check_version("1.9"): + xp, is_array_api, device = get_namespace_and_device(X) + else: + xp, is_array_api = get_namespace(X) + device = getattr(X, "device", None) + dispatch( self, "fit", @@ -86,7 +98,6 @@ def fit(self, X, y): # Ensure _fit_X matches the input namespace so that # kneighbors(X=None) can use get_namespace(self._fit_X). if is_array_api and not _is_numpy_namespace(xp): - device = getattr(X, "device", None) self._fit_X = xp.asarray(self._fit_X, device=device) return self @@ -169,7 +180,7 @@ def _onedal_fit(self, X, y, queue=None): ) # Process classification targets before passing to onedal - self._process_classification_targets(y, skip_validation=False) + self._process_classification_targets(X, y, skip_validation=False) # Call onedal backend onedal_params = { @@ -200,7 +211,7 @@ def _onedal_fit(self, X, y, queue=None): # Post-processing self._save_attributes() - def _process_classification_targets(self, y, skip_validation=False): + def _process_classification_targets(self, X, y, skip_validation=False): """Process classification targets and set class-related attributes. 
Parameters @@ -246,6 +257,10 @@ def _process_classification_targets(self, y, skip_validation=False): self.classes_ = self.classes_[0] self._y = xp.reshape(self._y, (-1,)) + if sklearn_check_version("1.9"): + xp_X, _, device = get_namespace_and_device(X) + self._y = move_to(self._y, xp=xp_X, device=device) + def _onedal_predict(self, X, queue=None): if X is not None: xp, _ = get_namespace(X) @@ -256,14 +271,20 @@ def _onedal_predict(self, X, queue=None): accept_sparse="csr", reset=False, ) + if sklearn_check_version("1.9"): + check_same_namespace(X, self, attribute="_fit_X", method="predict") params = self._onedal_estimator._get_onedal_params(X) params["result_option"] = "responses" result = self._onedal_estimator._onedal_predict( self._onedal_estimator._onedal_model, X, params ) - xp, _ = get_namespace(X) responses = from_table(result.responses, like=X) + if sklearn_check_version("1.9"): + xp, _, device = get_namespace_and_device(self.classes_) + responses = move_to(responses, xp=xp, device=device) + else: + xp, _ = get_namespace(X) return xp.take( self.classes_, xp.asarray(xp.reshape(responses, (-1,)), dtype=xp.int64) ) @@ -278,6 +299,8 @@ def _onedal_predict_proba(self, X, queue=None): accept_sparse="csr", reset=False, ) + if sklearn_check_version("1.9"): + check_same_namespace(X, self, attribute="_fit_X", method="predict_proba") neigh_dist, neigh_ind = self._onedal_estimator.kneighbors(X) @@ -299,6 +322,8 @@ def _onedal_kneighbors( accept_sparse="csr", reset=False, ) + if sklearn_check_version("1.9"): + check_same_namespace(X, self, attribute="_fit_X", method="kneighbors") else: query_is_train = True X = self._fit_X diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 9b2cb58e62..64089c0121 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -31,6 +31,13 @@ from ..utils.validation import validate_data from .common import KNeighborsDispatchingBase +if 
sklearn_check_version("1.9"): + from sklearn.utils._array_api import ( + check_same_namespace, + get_namespace_and_device, + move_to, + ) + @enable_array_api("1.5") # validate_data y_numeric requires sklearn >=1.5 @control_n_jobs(decorated_methods=["fit", "predict", "kneighbors", "score"]) @@ -68,7 +75,11 @@ def __init__( ) def fit(self, X, y): - xp, is_array_api = get_namespace(X) + if sklearn_check_version("1.9"): + xp, is_array_api, device = get_namespace_and_device(X) + else: + xp, is_array_api = get_namespace(X) + device = getattr(X, "device", None) dispatch( self, "fit", @@ -82,7 +93,6 @@ def fit(self, X, y): # Ensure _fit_X matches the input namespace so that # kneighbors(X=None) can use get_namespace(self._fit_X). if is_array_api and not _is_numpy_namespace(xp): - device = getattr(X, "device", None) self._fit_X = xp.asarray(self._fit_X, device=device) return self @@ -138,7 +148,10 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): ) def _onedal_fit(self, X, y, queue=None): - xp, _ = get_namespace(X, y) + if sklearn_check_version("1.9"): + xp, _, device = get_namespace_and_device(X) + else: + xp, _ = get_namespace(X, y) self._set_effective_metric() X, y = validate_data( @@ -151,6 +164,9 @@ def _onedal_fit(self, X, y, queue=None): y_numeric=True, ) + if sklearn_check_version("1.9"): + y = move_to(y, xp=xp, device=device) + self._process_regression_targets(y) onedal_params = { "n_neighbors": self.n_neighbors, @@ -215,6 +231,13 @@ def _predict_gpu(self, X, queue=None): accept_sparse="csr", reset=False, ) + # Note: if called before 'validate_data', this check would fail if 'X' is + # a 'DataFrame', since '_fit_X' would have already been converted to NumPy. + # Hence, it must come after the call to 'validate_data'. If the behavior + # of this validator changes in scikit-learn, these checks could be done + # earlier in the code for quicker errors. 
+ if sklearn_check_version("1.9"): + check_same_namespace(X, self, attribute="_fit_X", method="predict") result = self._onedal_estimator._predict_gpu(X) return result @@ -246,6 +269,8 @@ def _predict_skl(self, X, queue=None): X = validate_data( self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False ) + if sklearn_check_version("1.9"): + check_same_namespace(X, self, attribute="_fit_X", method="predict") return self._predict_skl_regression(X) def _onedal_kneighbors( @@ -262,6 +287,8 @@ def _onedal_kneighbors( accept_sparse="csr", reset=False, ) + if sklearn_check_version("1.9"): + check_same_namespace(X, self, attribute="_fit_X", method="kneighbors") else: query_is_train = True X = self._fit_X diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 19b48a81ef..4c06c471d5 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -29,6 +29,9 @@ from ..utils.validation import validate_data from .common import KNeighborsDispatchingBase +if sklearn_check_version("1.9"): + from sklearn.utils._array_api import check_same_namespace, get_namespace_and_device + @enable_array_api @control_n_jobs(decorated_methods=["fit", "kneighbors", "radius_neighbors"]) @@ -66,7 +69,12 @@ def __init__( ) def fit(self, X, y=None): - xp, is_array_api = get_namespace(X) + if sklearn_check_version("1.9"): + xp, is_array_api, device = get_namespace_and_device(X) + else: + xp, is_array_api = get_namespace(X) + device = getattr(X, "device", None) + dispatch( self, "fit", @@ -80,7 +88,6 @@ def fit(self, X, y=None): # Ensure _fit_X matches the input namespace so that # kneighbors(X=None) can use get_namespace(self._fit_X). 
if is_array_api and not _is_numpy_namespace(xp): - device = getattr(X, "device", None) self._fit_X = xp.asarray(self._fit_X, device=device) return self @@ -189,6 +196,8 @@ def _onedal_predict(self, X, queue=None): reset=False, force_all_finite=False, ) + if sklearn_check_version("1.9"): + check_same_namespace(X, self, attribute="_fit_X", method="predict") return self._onedal_estimator.predict(X, queue=queue) def _onedal_kneighbors( @@ -205,6 +214,8 @@ def _onedal_kneighbors( accept_sparse="csr", reset=False, ) + if sklearn_check_version("1.9"): + check_same_namespace(X, self, attribute="_fit_X", method="kneighbors") else: query_is_train = True X = self._fit_X diff --git a/sklearnex/neighbors/tests/test_neighbors.py b/sklearnex/neighbors/tests/test_neighbors.py index f1653f2291..0ba0989b13 100755 --- a/sklearnex/neighbors/tests/test_neighbors.py +++ b/sklearnex/neighbors/tests/test_neighbors.py @@ -14,16 +14,24 @@ # limitations under the License. # =============================================================================== +import array_api_strict import numpy as np +import pandas as pd import pytest from numpy.testing import assert_allclose, assert_array_equal from sklearn import datasets +from sklearn.base import is_regressor +from daal4py.sklearn._utils import sklearn_check_version from onedal.tests.utils._dataframes_support import ( _as_numpy, _convert_to_dataframe, + dpnp_available, get_dataframes_and_queues, + torch_available, + torch_xpu_available, ) +from onedal.tests.utils._device_selection import is_sycl_device_available from sklearnex.neighbors import ( KNeighborsClassifier, KNeighborsRegressor, @@ -31,6 +39,11 @@ NearestNeighbors, ) +if dpnp_available: + import dpnp +if torch_available: + import torch + @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) def test_sklearnex_import_knn_classifier(dataframe, queue): @@ -173,3 +186,146 @@ def test_p_present_if_metric_is_minkowski(): assert knn.effective_metric_ == "minkowski" assert 
"p" in knn.effective_metric_params_ assert knn.effective_metric_params_["p"] == 3 + + +# Note: this doesn't check 'kneighbors_graph', because that function +# transfers the data to NumPy internally, so it will not necessarily +# end up erroring out. +@pytest.mark.skipif( + not sklearn_check_version("1.9"), reason="Functionality introduced in alter versions." +) +@pytest.mark.parametrize("weights", ["uniform", "distance"]) +def test_error_on_incompatible_namespaces(weights, with_array_api): + rng = np.random.default_rng(seed=123) + X = rng.standard_normal(size=(25, 3)) + y = rng.standard_normal(size=X.shape[0]) + Xa = array_api_strict.from_dlpack(X) + ya = array_api_strict.from_dlpack(y) + + knn = KNeighborsRegressor(weights=weights).fit(X, y) + + with pytest.raises(ValueError, match="same namespace"): + knn.predict(Xa) + with pytest.raises(ValueError, match="same namespace"): + knn.kneighbors(Xa) + + knn = KNeighborsRegressor().fit(Xa, ya) + with pytest.raises(ValueError, match="same namespace"): + knn.predict(X) + with pytest.raises(ValueError, match="same namespace"): + knn.kneighbors(X) + + +@pytest.mark.skipif( + not sklearn_check_version("1.9"), + reason="Functionality introduced in later scikit-learn versions.", +) +@pytest.mark.parametrize("X_xp", [np, pd, array_api_strict]) +@pytest.mark.parametrize("y_xp", [np, pd, array_api_strict]) +@pytest.mark.parametrize("weights", ["uniform", "distance"]) +@pytest.mark.parametrize("n_classes", [0, 2, 3]) # 0 == regression +def test_mixed_array_namespaces(X_xp, y_xp, weights, n_classes, with_array_api): + rng = np.random.default_rng(seed=123) + X = rng.standard_normal(size=(50, 4)) + if n_classes == 0: # regressor + y = rng.standard_normal(size=X.shape[0]) + else: + y = rng.integers(n_classes, size=X.shape[0]) + + if X_xp is pd: + X = pd.DataFrame(X) + else: + X = X_xp.asarray(X) + if y_xp is pd: + if n_classes != 0: + y = np.array(["a", "b", "c"])[y] + y = pd.Series(y) + else: + y = y_xp.asarray(y) + + model = 
(KNeighborsClassifier if n_classes != 0 else KNeighborsRegressor)( + weights=weights + ) + model.fit(X, y) + pred = model.predict(X) + _ = model.score(X, y) + + _ = model.kneighbors(X) + _ = model.kneighbors_graph(X) + + if n_classes != 0: + proba = model.predict_proba(X) + if X_xp == pd: + assert isinstance(proba, np.ndarray) + else: + assert proba.__class__ == X.__class__ + + if n_classes == 0: + assert pred.__class__ == (X.__class__ if X_xp is not pd else np.ndarray) + else: + assert pred.__class__ == (y.__class__ if y_xp is not pd else np.ndarray) + + # Note: this is a quick check to ensure that the result has the same + # kind of values as the input. There's no particular justification + # behind requiring 25% classification accuracy. + if n_classes != 0: + if y_xp is pd: + y_xp = np + pred_is_correct = y_xp.astype(y_xp.asarray(pred == y), y_xp.float32) + assert y_xp.sum(pred_is_correct) >= (0.25 * int(X.shape[0])) + + +@pytest.mark.skipif( + not sklearn_check_version("1.9"), + reason="Functionality introduced in later scikit-learn versions.", +) +@pytest.mark.skipif( + not is_sycl_device_available("gpu"), reason="Test checks GPU-specific functionality." 
+) +@pytest.mark.parametrize( + "X_xp, X_device", + ([(torch, "xpu"), (torch, "cpu")] if torch_xpu_available else []) + + ([(dpnp, "gpu"), (dpnp, "cpu")] if dpnp_available else []), +) +@pytest.mark.parametrize( + "y_xp, y_device", + ([(torch, "xpu"), (torch, "cpu")] if torch_xpu_available else []) + + ([(dpnp, "gpu"), (dpnp, "cpu")] if dpnp_available else []) + + [(pd, None)], +) +@pytest.mark.parametrize( + "estimator", + [ + KNeighborsRegressor(algorithm="brute"), + KNeighborsClassifier(algorithm="brute"), + ], +) +def test_knn_mixed_devices(X_xp, y_xp, X_device, y_device, estimator, with_array_api): + rng = np.random.default_rng(seed=123) + X = rng.standard_normal(size=(50, 4)) + if is_regressor(estimator): + y = rng.standard_normal(size=X.shape[0]) + else: + y = rng.integers(2, size=X.shape[0]) + + X = X_xp.asarray(X, device=X_device) + if y_xp is pd: + if is_regressor(estimator): + y = pd.Series(y) + else: + y = pd.Series(np.array(["a", "b"])[y]) + else: + y = y_xp.asarray(y, device=y_device) + + estimator.fit(X, y) + pred = estimator.predict(X) + if is_regressor(estimator): + assert pred.__class__ == X.__class__ + else: + if y_xp is pd: + assert isinstance(pred, np.ndarray) + else: + assert pred.__class__ == y.__class__ + proba = estimator.predict_proba(X) + assert proba.__class__ == X.__class__ + _ = estimator.score(X, y)