diff --git a/lightgbmlss/model.py b/lightgbmlss/model.py
index a20cef7..e1fb055 100644
--- a/lightgbmlss/model.py
+++ b/lightgbmlss/model.py
@@ -14,9 +14,13 @@ from lightgbmlss.logger import CustomLogger
 lgb.register_logger(CustomLogger())
 from lightgbmlss.utils import *
+import optuna
+from optuna.samplers import TPESampler
+import shap
 from lightgbm.engine import CVBooster
 from lightgbm.basic import (Booster, Dataset)
+from lightgbm.basic import LightGBMError
 from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold
 from lightgbm.compat import SKLEARN_INSTALLED, _LGBMGroupKFold, _LGBMStratifiedKFold
@@ -240,8 +244,8 @@ def cv(self,
             num_boost_round=num_boost_round,
             folds=folds,
             nfold=nfold,
-            stratified=False,
-            shuffle=False,
+            stratified=stratified,
+            shuffle=shuffle,
             metrics=None,
             init_model=init_model,
             fpreproc=fpreproc,
@@ -264,7 +268,8 @@ def hyper_opt(
         study_name=None,
         silence=False,
         seed=None,
-        hp_seed=None
+        hp_seed=None,
+        shuffle=False
     ):
         """
         Function to tune hyperparameters using optuna.
@@ -298,88 +303,84 @@ def hyper_opt(
             Seed used to generate the folds (passed to numpy.random.seed).
         hp_seed: int
             Seed for random number generator used in the Bayesian hyper-parameter search.
+        shuffle: bool
+            Whether to shuffle the data before splitting into folds. Set to False for time-series data
+            to preserve temporal order.
 
         Returns
         -------
         opt_params : dict
             Optimal hyper-parameters.
         """
-        from skbase.utils.dependencies import _check_soft_dependencies
-
-        msg = (
-            "LightGBMLSS.hyper_opt requires 'optuna' and 'optuna-integration' "
-            "to be installed. Please install the package to use this feature. "
-            "Installing via pip install lightgbmlss[all_extras] also installs "
-            "the required dependencies."
-        )
-        _check_soft_dependencies(["optuna"], msg=msg)
-
-        import optuna
-        from optuna.samplers import TPESampler
-        from optuna.integration import LightGBMPruningCallback
-
         def objective(trial):
             hyper_params = {}
             for param_name, param_value in hp_dict.items():
                 param_type = param_value[0]
-                if param_type == "categorical" or param_type == "none":
-                    hyper_params.update({param_name: trial.suggest_categorical(param_name, param_value[1])})
-
+                if param_type in ["categorical", "none"]:
+                    hyper_params[param_name] = trial.suggest_categorical(param_name, param_value[1])
                 elif param_type == "float":
-                    param_constraints = param_value[1]
-                    param_low = param_constraints["low"]
-                    param_high = param_constraints["high"]
-                    param_log = param_constraints["log"]
-                    hyper_params.update(
-                        {param_name: trial.suggest_float(param_name,
-                                                         low=param_low,
-                                                         high=param_high,
-                                                         log=param_log
-                                                         )
-                         })
-
+                    constraints = param_value[1]
+                    hyper_params[param_name] = trial.suggest_float(
+                        param_name, low=constraints["low"], high=constraints["high"], log=constraints["log"]
+                    )
                 elif param_type == "int":
-                    param_constraints = param_value[1]
-                    param_low = param_constraints["low"]
-                    param_high = param_constraints["high"]
-                    param_log = param_constraints["log"]
-                    hyper_params.update(
-                        {param_name: trial.suggest_int(param_name,
-                                                       low=param_low,
-                                                       high=param_high,
-                                                       log=param_log
-                                                       )
-                         })
-
-            # Add booster if not included in dictionary
-            if "boosting" not in hyper_params.keys():
-                hyper_params.update({"boosting": trial.suggest_categorical("boosting", ["gbdt"])})
-
-            # Add pruning and early stopping
-            pruning_callback = LightGBMPruningCallback(trial, self.dist.loss_fn)
-            early_stopping_callback = lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False)
-
-            lgblss_param_tuning = self.cv(hyper_params,
-                                          train_set,
-                                          num_boost_round=num_boost_round,
-                                          nfold=nfold,
-                                          callbacks=[pruning_callback, early_stopping_callback],
-                                          seed=seed,
-                                          )
-
-            # Extract the optimal number of boosting rounds
-            opt_rounds = np.argmin(np.array(lgblss_param_tuning[f"valid {self.dist.loss_fn}-mean"])) + 1
-            trial.set_user_attr("opt_round", int(opt_rounds))
-
-            # Extract the best score
-            best_score = np.min(np.array(lgblss_param_tuning[f"valid {self.dist.loss_fn}-mean"]))
-
-            return best_score
+                    constraints = param_value[1]
+                    hyper_params[param_name] = trial.suggest_int(
+                        param_name, low=constraints["low"], high=constraints["high"], log=constraints["log"]
+                    )
+                else:
+                    raise ValueError(f"Invalid parameter type '{param_type}' for parameter '{param_name}'.")
+
+                if param_name == "clip_value":
+                    # "clip_value" is an attribute of the distribution, not a LightGBM parameter
+                    self.dist.clip_value = hyper_params[param_name]
+                    del hyper_params["clip_value"]
+
+            # Add booster if not included
+            if "boosting" not in hyper_params:
+                hyper_params["boosting"] = trial.suggest_categorical("boosting", ["gbdt"])
+
+            try:
+                # Custom pruning callback: lgb.cv reports aggregated scores under the
+                # "cv_agg" name, which optuna's LightGBMPruningCallback does not expect.
+                def cv_pruning_callback(env):
+                    if env.evaluation_result_list is None:
+                        return
+
+                    for evaluation_result in env.evaluation_result_list:
+                        valid_name, metric_name, current_score, _ = evaluation_result[:4]
+                        if metric_name not in (self.dist.loss_fn, f"valid {self.dist.loss_fn}"):
+                            continue
+                        if valid_name not in ("cv_agg", "valid", "valid_0"):
+                            continue
+
+                        trial.report(current_score, step=env.iteration)
+                        if trial.should_prune():
+                            raise optuna.exceptions.TrialPruned()
+
+                early_stopping_callback = lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False)
+
+                lgblss_param_tuning = self.cv(
+                    hyper_params,
+                    train_set,
+                    num_boost_round=num_boost_round,
+                    nfold=nfold,
+                    callbacks=[cv_pruning_callback, early_stopping_callback],
+                    seed=seed,
+                    shuffle=shuffle,
+                )
+                # Optimal boosting round = iteration with the lowest mean CV loss
+                opt_rounds = np.argmin(np.array(lgblss_param_tuning[f"valid {self.dist.loss_fn}-mean"])) + 1
+                trial.set_user_attr("opt_round", int(opt_rounds))
+                best_score = np.min(np.array(lgblss_param_tuning[f"valid {self.dist.loss_fn}-mean"]))
+                return best_score
+
+            except (LightGBMError, ValueError) as e:
+                print(f"Trial pruned: cross-validation raised {type(e).__name__}: {e}")
+                raise optuna.exceptions.TrialPruned() from e
+
         if study_name is None:
             study_name = "LightGBMLSS Hyper-Parameter Optimization"
@@ -398,12 +399,19 @@ def objective(trial):
         print("\nHyper-Parameter Optimization successfully finished.")
         print("  Number of finished trials: ", len(study.trials))
         print("  Best trial:")
+        completed_trials = [
+            trial for trial in study.trials if trial.state == optuna.trial.TrialState.COMPLETE
+        ]
+        if not completed_trials:
+            print("    No completed trials.")
+            return {}
+
         opt_param = study.best_trial
 
         # Add optimal stopping round
-        opt_param.params["opt_rounds"] = study.trials_dataframe()["user_attrs_opt_round"][
-            study.trials_dataframe()["value"].idxmin()]
-        opt_param.params["opt_rounds"] = int(opt_param.params["opt_rounds"])
+        # best_trial only ranks completed trials, so a pruned trial (whose
+        # "opt_round" attribute was never set) can never be selected here
+        opt_param.params["opt_rounds"] = int(opt_param.user_attrs["opt_round"])
 
         print("  Value: {}".format(opt_param.value))
         print("  Params: ")
diff --git a/tests/test_model/test_model.py b/tests/test_model/test_model.py
index d33625f..8afa1a0 100644
--- a/tests/test_model/test_model.py
+++ b/tests/test_model/test_model.py
@@ -144,6 +144,25 @@ def test_model_hpo(self, univariate_data, univariate_lgblss,):
         # Assertions
         assert isinstance(opt_param, dict)
 
+    def test_model_cv_shuffle_respected(self, univariate_data, univariate_lgblss, monkeypatch):
+        # Unpack
+        dtrain, _, _, _ = univariate_data
+        lgblss = univariate_lgblss
+
+        captured = []
+
+        def fake_cv(params, train_set, **kwargs):
+            captured.append(kwargs.get("shuffle"))  # record the forwarded kwarg
+            return {}
+
+        monkeypatch.setattr(lgb, "cv", fake_cv)  # stub out the underlying lgb.cv call
+
+        lgblss.cv({}, dtrain, shuffle=True, nfold=2)
+        lgblss.cv({}, dtrain, shuffle=False, nfold=2)
+
+        # Assertions
+        assert captured == [True, False]
+
     def test_model_predict(self, univariate_data, univariate_lgblss, univariate_params):
         # Unpack
         dtrain, _, _, X_test = univariate_data
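
For reviewers, a minimal usage sketch of the new `shuffle` flag. The synthetic data, the `Gaussian` import path, and the concrete argument values are illustrative assumptions based on the LightGBMLSS README examples, not part of this patch; `hp_dict` follows the `(type, constraints)` convention parsed in `objective()` above.

```python
# Hypothetical usage sketch (not part of the patch): tuning on time-ordered
# data with shuffle=False so folds preserve temporal order.
import lightgbm as lgb
import numpy as np

from lightgbmlss.model import LightGBMLSS
from lightgbmlss.distributions.Gaussian import Gaussian  # assumed import path

rng = np.random.default_rng(42)
X = rng.random((500, 4))
y = X[:, 0] + rng.normal(scale=0.1, size=500)
dtrain = lgb.Dataset(X, label=y)

lgblss = LightGBMLSS(Gaussian())

# "float"/"int" entries take {"low", "high", "log"}; "categorical" takes a list.
opt_params = lgblss.hyper_opt(
    {
        "eta": ["float", {"low": 1e-3, "high": 0.3, "log": True}],
        "max_depth": ["int", {"low": 2, "high": 8, "log": False}],
    },
    dtrain,
    num_boost_round=100,
    nfold=3,
    early_stopping_rounds=20,
    max_minutes=5,
    n_trials=10,
    shuffle=False,  # time-series: keep row order within folds
)
```

Before this patch, `cv()` hard-coded `stratified=False, shuffle=False` in the underlying `lgb.cv` call and silently ignored the caller's arguments; the new `test_model_cv_shuffle_respected` pins down exactly this forwarding behavior.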
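Separately, the hand-rolled `cv_pruning_callback` replaces `optuna.integration.LightGBMPruningCallback`, which targets `lgb.train` with a named validation set and does not match the `"cv_agg"` entries that `lgb.cv` passes to callbacks. Below is a self-contained sketch of the same pruning pattern against plain LightGBM; the `l2` metric and the `-mean` key lookup are assumptions meant to absorb result-key naming differences across LightGBM versions.

```python
# Standalone sketch of iteration-level pruning with lgb.cv: report the
# aggregated ("cv_agg") mean metric to Optuna each round and prune when asked.
import lightgbm as lgb
import numpy as np
import optuna


def objective(trial):
    rng = np.random.default_rng(0)
    X = rng.random((200, 3))
    y = X.sum(axis=1) + rng.normal(scale=0.1, size=200)
    dtrain = lgb.Dataset(X, label=y)

    def pruning_cb(env):
        # lgb.cv passes ("cv_agg", metric_name, mean, is_higher_better, stdv) tuples
        for res in env.evaluation_result_list or []:
            valid_name, _, mean_score = res[:3]
            if valid_name == "cv_agg":
                trial.report(mean_score, step=env.iteration)
                if trial.should_prune():
                    raise optuna.TrialPruned()

    params = {
        "objective": "regression",
        "metric": "l2",
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "verbosity": -1,
    }
    cv_results = lgb.cv(params, dtrain, num_boost_round=50, nfold=3,
                        stratified=False, shuffle=False, callbacks=[pruning_cb])
    mean_key = next(k for k in cv_results if k.endswith("-mean"))
    return min(cv_results[mean_key])


study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=5)
```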