381 changes: 371 additions & 10 deletions flaml/automl/automl.py

Large diffs are not rendered by default.
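Although the automl.py diff is collapsed, the new tests below exercise the pickle/load_pickle round-trip it introduces. A minimal sketch of that API as the tests use it (data and paths are illustrative; the .flaml_artifacts path matches the cleanup calls in the tests):

from sklearn.datasets import load_iris
from flaml import AutoML

X, y = load_iris(return_X_y=True)
automl = AutoML()
automl.fit(X_train=X, y_train=y, task="classification", time_budget=10)
automl.pickle("automl.pkl")                  # persist the fitted AutoML object
loaded = AutoML().load_pickle("automl.pkl")  # restore it into a fresh object
assert loaded.best_estimator == automl.best_estimator
preds = loaded.predict(X)                    # the round-trip keeps prediction working
# auxiliary state (if any) lands in automl.pkl.flaml_artifacts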

4 changes: 3 additions & 1 deletion flaml/automl/model.py
@@ -135,6 +135,7 @@ def __init__(self, task="binary", **config):
         self._task = task if isinstance(task, Task) else task_factory(task, None, None)
         self.params = self.config2params(config)
         self.estimator_class = self._model = None
+        self.estimator_baseclass = "sklearn"
         if "_estimator_type" in self.params:
             self._estimator_type = self.params.pop("_estimator_type")
         else:
@@ -439,6 +440,7 @@ def __init__(self, task="binary", **config):
             raise SPARK_ERROR
         super().__init__(task, **config)
         self.df_train = None
+        self.estimator_baseclass = "spark"
 
     def _preprocess(
         self,
@@ -974,7 +976,7 @@ def _tokenize_text(self, X, y=None, **kwargs):
         from .nlp.huggingface.utils import tokenize_text
         from .nlp.utils import is_a_list_of_str
 
-        is_str = str(X.dtypes[0]) in ("string", "str")
+        is_str = str(X.dtypes.iloc[0]) in ("string", "str")
         is_list_of_str = is_a_list_of_str(X[list(X.keys())[0]].to_list()[0])
 
         if is_str or is_list_of_str:
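The dtype check now uses .iloc; plain X.dtypes[0] falls back to integer-key indexing on a label-indexed Series, which pandas has deprecated. A small illustration (assumes pandas 2.x):

import pandas as pd

X = pd.DataFrame({"text": pd.array(["a", "b"], dtype="string")})
# str(X.dtypes[0])      # integer key on a label-indexed Series: deprecated fallback
str(X.dtypes.iloc[0])   # explicit positional access -> "string"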
2 changes: 1 addition & 1 deletion flaml/automl/nlp/huggingface/training_args.py
@@ -5,7 +5,7 @@
 from flaml.automl.task.task import NLG_TASKS
 
 try:
-    from transformers import TrainingArguments
+    from transformers import Seq2SeqTrainingArguments as TrainingArguments
 except ImportError:
     TrainingArguments = object
 
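Importing Seq2SeqTrainingArguments under the TrainingArguments name gives NLG (generation) tasks the generation-specific options the base class lacks, while the rest of the code keeps referring to TrainingArguments. Illustrative construction (values are placeholders):

from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    output_dir="out",
    predict_with_generate=True,  # run generate() during evaluation, as NLG metrics need
    generation_max_length=64,    # cap on generated sequence length
)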
2 changes: 1 addition & 1 deletion flaml/automl/nlp/huggingface/utils.py
@@ -396,7 +396,7 @@ def get_this_model(checkpoint_path, task, model_config):
 
         if task in (SEQCLASSIFICATION, SEQREGRESSION):
             return AutoModelForSequenceClassification.from_pretrained(
-                checkpoint_path, config=model_config, ignore_mismatched_sizes=True
+                checkpoint_path, config=model_config, ignore_mismatched_sizes=True, trust_remote_code=True
             )
         elif task == TOKENCLASSIFICATION:
             return AutoModelForTokenClassification.from_pretrained(checkpoint_path, config=model_config)
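trust_remote_code=True allows from_pretrained to execute model code shipped inside the Hub repository rather than in transformers itself, so it should only be enabled for trusted checkpoints. Illustrative call (the checkpoint name is hypothetical):

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "org/custom-arch-model",  # hypothetical repo that ships its own modeling code
    trust_remote_code=True,   # opt in to running that repo-provided code
)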
2 changes: 1 addition & 1 deletion flaml/automl/task/time_series_task.py
@@ -151,7 +151,7 @@ def validate_data(
             raise ValueError("Must supply either X_train_all and y_train_all, or dataframe and label")
 
         try:
-            dataframe[self.time_col] = pd.to_datetime(dataframe[self.time_col])
+            dataframe.loc[:, self.time_col] = pd.to_datetime(dataframe[self.time_col])
         except Exception:
             raise ValueError(
                 f"For '{TS_FORECAST}' task, time column {self.time_col} must contain timestamp values."
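Writing through .loc makes the assignment explicit on the frame itself, which avoids pandas' chained-assignment warnings when validate_data receives a slice of a larger DataFrame. A minimal illustration:

import pandas as pd

df = pd.DataFrame({"ds": ["2021-01-01", "2021-01-02"], "y": [1, 2]})
df.loc[:, "ds"] = pd.to_datetime(df["ds"])  # in-place column write, no chained indexing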
11 changes: 10 additions & 1 deletion flaml/automl/time_series/sklearn.py
@@ -76,6 +76,8 @@ def __init__(
         self.pca = None
 
     def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs):
+        if "is_retrain" in kwargs:
+            kwargs.pop("is_retrain")
         self._X = X
         self._y = y
 
@@ -92,7 +94,14 @@ def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs):
 
         for i, model in enumerate(self.models):
             offset = i + self.lags
-            model.fit(X_trans[: len(X) - offset], y[offset:], **fit_params)
+            if len(X) - offset > 2:
+                # A series of length 2 fails with "All features are either constant or ignored."
+                # TODO: see why the non-constant features are ignored. Selector?
+                model.fit(X_trans[: len(X) - offset], y[offset:], **fit_params)
+            elif len(X) > offset and "catboost" not in str(model).lower():
+                model.fit(X_trans[: len(X) - offset], y[offset:], **fit_params)
+            else:
+                print("[INFO]: Length of data should be longer than period + lags.")
         return self
 
     def predict(self, X, X_train=None, y_train=None):
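For context on the guard above: model i is fit on feature rows X[t] paired with targets y[t + i + lags], so each extra horizon shrinks the usable sample by one row. A toy sketch of that alignment (names illustrative):

import numpy as np

n, lags, i = 10, 3, 1
offset = i + lags            # horizon-specific shift, as in the loop above: 4
y = np.arange(n)
feature_rows = n - offset    # 6 rows of X_trans remain usable
targets = y[offset:]         # y[4:] -> the 6 targets aligned with those rows
assert feature_rows == len(targets)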
9 changes: 7 additions & 2 deletions flaml/automl/time_series/ts_data.py
@@ -121,7 +121,12 @@ def X_val(self) -> pd.DataFrame:
 
     @property
     def X_all(self) -> pd.DataFrame:
-        return pd.concat([self.X_train, self.X_val], axis=0)
+        # Remove empty or all-NA columns before concatenation
+        X_train_filtered = self.X_train.dropna(axis=1, how="all")
+        X_val_filtered = self.X_val.dropna(axis=1, how="all")
+
+        # Concatenate the filtered DataFrames
+        return pd.concat([X_train_filtered, X_val_filtered], axis=0)
 
     @property
     def y_train(self) -> pd.DataFrame:
@@ -472,7 +477,7 @@ def transform(self, X: Union[DataFrame, np.array], y=None):
                 if "__NAN__" not in X[col].cat.categories:
                     X[col] = X[col].cat.add_categories("__NAN__").fillna("__NAN__")
                 else:
-                    X[col] = X[col].fillna("__NAN__")
+                    X[col] = X[col].fillna("__NAN__").infer_objects(copy=False)
                 X[col] = X[col].astype("category")
 
         for column in self.num_columns:
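Dropping all-NA columns before concatenation sidesteps the pandas 2.x deprecation around concatenating empty or all-NA DataFrame columns and keeps X_all's dtypes stable. A small illustration:

import pandas as pd

a = pd.DataFrame({"x": [1, 2], "unused": [pd.NA, pd.NA]})
b = pd.DataFrame({"x": [3], "unused": [pd.NA]})
parts = [f.dropna(axis=1, how="all") for f in (a, b)]  # strip all-NA columns first
X_all = pd.concat(parts, axis=0)                       # concat without the warning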
14 changes: 13 additions & 1 deletion test/automl/test_regression.py
@@ -130,7 +130,7 @@ def test_sparse_matrix_regression(self):
         )
         automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings)
 
-    def test_parallel(self, hpo_method=None):
+    def test_parallel_and_pickle(self, hpo_method=None):
         automl_experiment = AutoML()
         automl_settings = {
             "time_budget": 10,
@@ -153,6 +153,18 @@ def test_parallel_and_pickle(self, hpo_method=None):
         except ImportError:
             return
 
+        # test pickle and load_pickle, should work for prediction
+        automl_experiment.pickle("automl_xgboost_spark.pkl")
+        automl_loaded = AutoML().load_pickle("automl_xgboost_spark.pkl")
+        assert automl_loaded.best_estimator == automl_experiment.best_estimator
+        assert automl_loaded.best_loss == automl_experiment.best_loss
+        automl_loaded.predict(X_train)
+
+        import shutil
+
+        shutil.rmtree("automl_xgboost_spark.pkl", ignore_errors=True)
+        shutil.rmtree("automl_xgboost_spark.pkl.flaml_artifacts", ignore_errors=True)
+
     def test_sparse_matrix_regression_holdout(self):
         X_train = scipy.sparse.random(8, 100)
         y_train = np.random.uniform(size=8)
29 changes: 21 additions & 8 deletions test/spark/test_0sparkml.py
@@ -165,7 +165,7 @@ def test_spark_synapseml_rank():
     _test_spark_synapseml_lightgbm(spark, "rank")
 
 
-def test_spark_input_df():
+def test_spark_input_df_and_pickle():
     import pandas as pd
 
     file_url = "https://mmlspark.blob.core.windows.net/publicwasb/company_bankruptcy_prediction_data.csv"
@@ -201,6 +201,19 @@ def test_spark_input_df():
         **settings,
     )
 
+    # test pickle and load_pickle, should work for prediction
+    automl.pickle("automl_spark.pkl")
+    automl_loaded = AutoML().load_pickle("automl_spark.pkl")
+    assert automl_loaded.best_estimator == automl.best_estimator
+    assert automl_loaded.best_loss == automl.best_loss
+    automl_loaded.predict(df)
+    automl_loaded.model.estimator.transform(test_data)
+
+    import shutil
+
+    shutil.rmtree("automl_spark.pkl", ignore_errors=True)
+    shutil.rmtree("automl_spark.pkl.flaml_artifacts", ignore_errors=True)
+
     if estimator_list == ["rf_spark"]:
         return
 
@@ -393,13 +406,13 @@ def test_auto_convert_dtypes_spark():
 
 
 if __name__ == "__main__":
-    test_spark_synapseml_classification()
-    test_spark_synapseml_regression()
-    test_spark_synapseml_rank()
-    test_spark_input_df()
-    test_get_random_dataframe()
-    test_auto_convert_dtypes_pandas()
-    test_auto_convert_dtypes_spark()
+    # test_spark_synapseml_classification()
+    # test_spark_synapseml_regression()
+    # test_spark_synapseml_rank()
+    test_spark_input_df_and_pickle()
+    # test_get_random_dataframe()
+    # test_auto_convert_dtypes_pandas()
+    # test_auto_convert_dtypes_spark()
 
     # import cProfile
     # import pstats
34 changes: 23 additions & 11 deletions test/spark/test_automl.py
@@ -28,10 +28,10 @@
 pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]
 
 
-def test_parallel_xgboost(hpo_method=None, data_size=1000):
+def test_parallel_xgboost_and_pickle(hpo_method=None, data_size=1000):
     automl_experiment = AutoML()
     automl_settings = {
-        "time_budget": 10,
+        "time_budget": 30,
         "metric": "ap",
         "task": "classification",
         "log_file_name": "test/sparse_classification.log",
@@ -53,15 +53,27 @@ def test_parallel_xgboost(hpo_method=None, data_size=1000):
     print(automl_experiment.best_iteration)
     print(automl_experiment.best_estimator)
 
+    # test pickle and load_pickle, should work for prediction
+    automl_experiment.pickle("automl_xgboost_spark.pkl")
+    automl_loaded = AutoML().load_pickle("automl_xgboost_spark.pkl")
+    assert automl_loaded.best_estimator == automl_experiment.best_estimator
+    assert automl_loaded.best_loss == automl_experiment.best_loss
+    automl_loaded.predict(X_train)
+
+    import shutil
+
+    shutil.rmtree("automl_xgboost_spark.pkl", ignore_errors=True)
+    shutil.rmtree("automl_xgboost_spark.pkl.flaml_artifacts", ignore_errors=True)
+
 
 def test_parallel_xgboost_others():
     # use random search as the hpo_method
-    test_parallel_xgboost(hpo_method="random")
+    test_parallel_xgboost_and_pickle(hpo_method="random")
 
 
 @pytest.mark.skip(reason="currently not supporting too large data, will support spark dataframe in the future")
 def test_large_dataset():
-    test_parallel_xgboost(data_size=90000000)
+    test_parallel_xgboost_and_pickle(data_size=90000000)
 
 
 @pytest.mark.skipif(
@@ -95,10 +107,10 @@ def test_custom_learner(data_size=1000):
 
 
 if __name__ == "__main__":
-    test_parallel_xgboost()
-    test_parallel_xgboost_others()
-    # test_large_dataset()
-    if skip_my_learner:
-        print("please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file")
-    else:
-        test_custom_learner()
+    test_parallel_xgboost_and_pickle()
+    # test_parallel_xgboost_others()
+    # # test_large_dataset()
+    # if skip_my_learner:
+    #     print("please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file")
+    # else:
+    #     test_custom_learner()