diff --git a/flaml/default/estimator.py b/flaml/default/estimator.py
index 5b46150f81..fcb318638e 100644
--- a/flaml/default/estimator.py
+++ b/flaml/default/estimator.py
@@ -95,6 +95,27 @@ def suggest_hyperparams(self, X, y):
     def fit(self, X, y, *args, **params):
         hyperparams, estimator_name, X, y_transformed = self.suggest_hyperparams(X, y)
         self.set_params(**hyperparams)
+
+        # Transform eval_set if present
+        if "eval_set" in params and params["eval_set"] is not None:
+            transformed_eval_set = []
+            for eval_X, eval_y in params["eval_set"]:
+                # Transform features
+                eval_X_transformed = self._feature_transformer.transform(eval_X)
+                # Transform labels if applicable
+                if self._label_transformer and estimator_name in [
+                    "rf",
+                    "extra_tree",
+                    "xgboost",
+                    "xgb_limitdepth",
+                    "choose_xgb",
+                ]:
+                    eval_y_transformed = self._label_transformer.transform(eval_y)
+                    transformed_eval_set.append((eval_X_transformed, eval_y_transformed))
+                else:
+                    transformed_eval_set.append((eval_X_transformed, eval_y))
+            params["eval_set"] = transformed_eval_set
+
         if self._label_transformer and estimator_name in [
             "rf",
             "extra_tree",
diff --git a/test/default/test_defaults.py b/test/default/test_defaults.py
index acf50e4ea9..04f0fb70bb 100644
--- a/test/default/test_defaults.py
+++ b/test/default/test_defaults.py
@@ -183,6 +183,8 @@ def test_lgbm():
 
 
 def test_xgboost():
+    import numpy as np
+
     from flaml.default import XGBClassifier, XGBRegressor
 
     X_train, y_train = load_breast_cancer(return_X_y=True, as_frame=True)
@@ -200,6 +202,65 @@ def test_xgboost():
     regressor.predict(X_train)
     print(regressor)
 
+    # Test eval_set with categorical features (Issue: eval_set not preprocessed)
+    np.random.seed(42)
+    n = 500
+    df = pd.DataFrame(
+        {
+            "num1": np.random.randn(n),
+            "num2": np.random.rand(n) * 10,
+            "cat1": np.random.choice(["A", "B", "C"], size=n),
+            "cat2": np.random.choice(["X", "Y"], size=n),
+            "target": np.random.choice([0, 1], size=n),
+        }
+    )
+
+    X = df.drop(columns="target")
+    y = df["target"]
+
+    X_train_cat, X_valid_cat, y_train_cat, y_valid_cat = train_test_split(X, y, test_size=0.2, random_state=0)
+
+    # Convert categorical columns to pandas 'category' dtype
+    for col in X_train_cat.select_dtypes(include="object").columns:
+        X_train_cat[col] = X_train_cat[col].astype("category")
+        X_valid_cat[col] = X_valid_cat[col].astype("category")
+
+    # Test XGBClassifier with eval_set
+    classifier_eval = XGBClassifier(
+        tree_method="hist",
+        enable_categorical=True,
+        eval_metric="logloss",
+        use_label_encoder=False,
+        early_stopping_rounds=10,
+        random_state=0,
+        n_estimators=10,
+    )
+    classifier_eval.fit(X_train_cat, y_train_cat, eval_set=[(X_valid_cat, y_valid_cat)], verbose=False)
+    y_pred = classifier_eval.predict(X_valid_cat)
+    assert len(y_pred) == len(y_valid_cat)
+
+    # Test XGBRegressor with eval_set
+    y_reg = df["num1"]  # Use num1 as target for regression
+    X_reg = df.drop(columns=["num1", "target"])
+
+    X_train_reg, X_valid_reg, y_train_reg, y_valid_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=0)
+
+    for col in X_train_reg.select_dtypes(include="object").columns:
+        X_train_reg[col] = X_train_reg[col].astype("category")
+        X_valid_reg[col] = X_valid_reg[col].astype("category")
+
+    regressor_eval = XGBRegressor(
+        tree_method="hist",
+        enable_categorical=True,
+        eval_metric="rmse",
+        early_stopping_rounds=10,
+        random_state=0,
+        n_estimators=10,
+    )
+    regressor_eval.fit(X_train_reg, y_train_reg, eval_set=[(X_valid_reg, y_valid_reg)], verbose=False)
+    y_pred = regressor_eval.predict(X_valid_reg)
+    assert len(y_pred) == len(y_valid_reg)
+
 
 def test_nobudget():
     X_train, y_train = load_breast_cancer(return_X_y=True, as_frame=True)