154 changes: 80 additions & 74 deletions lightgbmlss/model.py
@@ -14,9 +14,14 @@
from lightgbmlss.logger import CustomLogger
lgb.register_logger(CustomLogger())
from lightgbmlss.utils import *
import optuna
from optuna.samplers import TPESampler
from optuna.integration import LightGBMPruningCallback
import shap

from lightgbm.engine import CVBooster
from lightgbm.basic import (Booster, Dataset)
from lightgbm.basic import LightGBMError

from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold
from lightgbm.compat import SKLEARN_INSTALLED, _LGBMGroupKFold, _LGBMStratifiedKFold
@@ -240,8 +245,8 @@ def cv(self,
num_boost_round=num_boost_round,
folds=folds,
nfold=nfold,
stratified=False,
shuffle=False,
stratified=stratified,
shuffle=shuffle,
metrics=None,
init_model=init_model,
fpreproc=fpreproc,
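Since `stratified` and `shuffle` are now forwarded instead of being hard-coded to `False`, callers control fold construction directly. A minimal usage sketch under assumptions: synthetic data stands in for a real time series, and the `Gaussian` distribution import path is taken from lightgbmlss's distributions module, not from this diff:

```python
import lightgbm as lgb
import numpy as np

from lightgbmlss.model import LightGBMLSS
from lightgbmlss.distributions.Gaussian import Gaussian  # assumed import path

# Synthetic stand-in for time-series data: rows are in temporal order.
X = np.random.rand(500, 4)
y = np.random.rand(500)
dtrain = lgb.Dataset(X, label=y)

lgblss = LightGBMLSS(Gaussian())

# shuffle=False preserves row order when building the folds,
# which is what you want for temporally ordered data.
cv_res = lgblss.cv({"eta": 0.1}, dtrain, num_boost_round=50, nfold=3, shuffle=False)
```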
@@ -264,7 +269,8 @@ def hyper_opt(
study_name=None,
silence=False,
seed=None,
hp_seed=None
hp_seed=None,
shuffle=False
):
"""
Function to tune hyperparameters using optuna.
@@ -298,88 +304,81 @@ def hyper_opt(
Seed used to generate the folds (passed to numpy.random.seed).
hp_seed: int
Seed for random number generator used in the Bayesian hyper-parameter search.
shuffle: bool
Whether to shuffle the data before splitting into folds. Set to False for time-series data
to preserve temporal order.

Returns
-------
opt_params : dict
Optimal hyper-parameters.
"""
from skbase.utils.dependencies import _check_soft_dependencies

msg = (
"LightGBMLSS.hyper_opt requires 'optuna' and 'optuna-integration' "
"to be installed. Please install the package to use this feature. "
"Installing via pip install lightgbmlss[all_extras] also installs "
"the required dependencies."
)
_check_soft_dependencies(["optuna"], msg=msg)

import optuna
from optuna.samplers import TPESampler
from optuna.integration import LightGBMPruningCallback

def objective(trial):

hyper_params = {}

for param_name, param_value in hp_dict.items():

param_type = param_value[0]

if param_type == "categorical" or param_type == "none":
hyper_params.update({param_name: trial.suggest_categorical(param_name, param_value[1])})

if param_type in ["categorical", "none"]:
hyper_params[param_name] = trial.suggest_categorical(param_name, param_value[1])
elif param_type == "float":
param_constraints = param_value[1]
param_low = param_constraints["low"]
param_high = param_constraints["high"]
param_log = param_constraints["log"]
hyper_params.update(
{param_name: trial.suggest_float(param_name,
low=param_low,
high=param_high,
log=param_log
)
})

constraints = param_value[1]
hyper_params[param_name] = trial.suggest_float(
param_name, low=constraints["low"], high=constraints["high"], log=constraints["log"]
)
elif param_type == "int":
param_constraints = param_value[1]
param_low = param_constraints["low"]
param_high = param_constraints["high"]
param_log = param_constraints["log"]
hyper_params.update(
{param_name: trial.suggest_int(param_name,
low=param_low,
high=param_high,
log=param_log
)
})

# Add booster if not included in dictionary
if "boosting" not in hyper_params.keys():
hyper_params.update({"boosting": trial.suggest_categorical("boosting", ["gbdt"])})

# Add pruning and early stopping
pruning_callback = LightGBMPruningCallback(trial, self.dist.loss_fn)
early_stopping_callback = lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False)

lgblss_param_tuning = self.cv(hyper_params,
train_set,
num_boost_round=num_boost_round,
nfold=nfold,
callbacks=[pruning_callback, early_stopping_callback],
seed=seed,
)

# Extract the optimal number of boosting rounds
opt_rounds = np.argmin(np.array(lgblss_param_tuning[f"valid {self.dist.loss_fn}-mean"])) + 1
trial.set_user_attr("opt_round", int(opt_rounds))

# Extract the best score
best_score = np.min(np.array(lgblss_param_tuning[f"valid {self.dist.loss_fn}-mean"]))

return best_score
constraints = param_value[1]
hyper_params[param_name] = trial.suggest_int(
param_name, low=constraints["low"], high=constraints["high"], log=constraints["log"]
)
else:
raise ValueError("Invalid parameter type.")

if param_name == "clip_value":
self.dist.clip_value = hyper_params[param_name]
del hyper_params["clip_value"]

# Add booster if not included
if "boosting" not in hyper_params:
hyper_params["boosting"] = trial.suggest_categorical("boosting", ["gbdt"])

try:
def cv_pruning_callback(env):
if env.evaluation_result_list is None:
return

for evaluation_result in env.evaluation_result_list:
valid_name, metric_name, current_score, _ = evaluation_result[:4]
if metric_name not in (self.dist.loss_fn, f"valid {self.dist.loss_fn}"):
continue
if valid_name not in ("cv_agg", "valid", "valid_0"):
continue

trial.report(current_score, step=env.iteration)
if trial.should_prune():
raise optuna.exceptions.TrialPruned()
return

early_stopping_callback = lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False)

lgblss_param_tuning = self.cv(
hyper_params,
train_set,
num_boost_round=num_boost_round,
nfold=nfold,
callbacks=[cv_pruning_callback, early_stopping_callback],
seed=seed,
shuffle=shuffle,
)

opt_rounds = np.argmin(np.array(lgblss_param_tuning[f"valid {self.dist.loss_fn}-mean"])) + 1
trial.set_user_attr("opt_round", int(opt_rounds))
best_score = np.min(np.array(lgblss_param_tuning[f"valid {self.dist.loss_fn}-mean"]))
return best_score

except (LightGBMError, ValueError) as e:
print(f"Trial pruned due to LightGBMError or ValueError: {e}")
raise optuna.exceptions.TrialPruned()
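For reference, the branches above imply the following `hp_dict` layout: each value is a list whose first element is the parameter type and whose second element is either a constraints dict with `"low"`, `"high"`, and `"log"` keys (for `"float"` and `"int"`) or a list of choices (for `"categorical"` and `"none"`). A sketch with illustrative parameter names and bounds, not taken from this diff:

```python
# Illustrative search space matching the parsing logic in objective().
hp_dict = {
    "eta":       ["float", {"low": 1e-5, "high": 1.0, "log": True}],
    "max_depth": ["int",   {"low": 1, "high": 10, "log": False}],
    "boosting":  ["categorical", ["gbdt"]],
    # "clip_value" is routed to self.dist.clip_value and removed from the
    # LightGBM params, per the branch above.
    "clip_value": ["float", {"low": 1e-3, "high": 10.0, "log": True}],
}
```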

if study_name is None:
study_name = "LightGBMLSS Hyper-Parameter Optimization"

@@ -398,12 +397,19 @@ def objective(trial):
print("\nHyper-Parameter Optimization successfully finished.")
print(" Number of finished trials: ", len(study.trials))
print(" Best trial:")
completed_trials = [
trial for trial in study.trials if trial.state == optuna.trial.TrialState.COMPLETE
]
if not completed_trials:
print(" No completed trials.")
return {}

opt_param = study.best_trial

# Add optimal stopping round
opt_param.params["opt_rounds"] = study.trials_dataframe()["user_attrs_opt_round"][
study.trials_dataframe()["value"].idxmin()]
opt_param.params["opt_rounds"] = int(opt_param.params["opt_rounds"])
trials_df = study.trials_dataframe().dropna(subset=["value"])
opt_rounds = trials_df.loc[trials_df["value"].idxmin(), "user_attrs_opt_round"]
opt_param.params["opt_rounds"] = int(opt_rounds)

print(" Value: {}".format(opt_param.value))
print(" Params: ")
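Taken together, a hypothetical end-to-end tuning call using the new `shuffle` argument might look like the sketch below. Only arguments that appear in this diff are used; `dtrain`, `lgblss`, and `hp_dict` are as in the earlier sketches:

```python
opt_param = lgblss.hyper_opt(
    hp_dict,
    dtrain,
    num_boost_round=100,
    nfold=5,
    early_stopping_rounds=20,
    seed=123,
    hp_seed=123,
    silence=True,
    shuffle=False,  # keep temporal order across CV folds
)

# Optimal number of boosting rounds recorded by the best trial
# (hyper_opt now returns {} when no trial completed).
n_rounds = opt_param.get("opt_rounds")
```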
19 changes: 19 additions & 0 deletions tests/test_model/test_model.py
@@ -144,6 +144,25 @@ def test_model_hpo(self, univariate_data, univariate_lgblss,):
# Assertions
assert isinstance(opt_param, dict)

def test_model_cv_shuffle_respected(self, univariate_data, univariate_lgblss, monkeypatch):
# Unpack
dtrain, _, _, _ = univariate_data
lgblss = univariate_lgblss

captured = []

def fake_cv(params, train_set, **kwargs):
captured.append(kwargs.get("shuffle"))
return {}

monkeypatch.setattr(lgb, "cv", fake_cv)

lgblss.cv({}, dtrain, shuffle=True, nfold=2)
lgblss.cv({}, dtrain, shuffle=False, nfold=2)

# Assertions
assert captured == [True, False]

def test_model_predict(self, univariate_data, univariate_lgblss, univariate_params):
# Unpack
dtrain, _, _, X_test = univariate_data