Commit ced1d6f

Support pickling the whole AutoML instance, Sync Fabric till 0d4ab16f (#1481)
1 parent bb213e7 · commit ced1d6f

10 files changed: 451 additions & 37 deletions


flaml/automl/automl.py

Lines changed: 371 additions & 10 deletions
Large diffs are not rendered by default.
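The pickling support added in automl.py is exercised by the updated tests further down: AutoML.pickle() writes the whole fitted instance to disk and AutoML().load_pickle() restores it for prediction. A minimal usage sketch based on those test calls (the dataset, file name, and time budget here are illustrative, not part of this commit):

from flaml import AutoML
from sklearn.datasets import load_iris

X_train, y_train = load_iris(return_X_y=True)

automl = AutoML()
automl.fit(X_train, y_train, task="classification", time_budget=10)

# Persist the whole AutoML instance ...
automl.pickle("automl.pkl")

# ... and restore it later, e.g. in another process, for prediction.
automl_loaded = AutoML().load_pickle("automl.pkl")
print(automl_loaded.best_estimator, automl_loaded.best_loss)
print(automl_loaded.predict(X_train)[:5])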

flaml/automl/model.py

Lines changed: 3 additions & 1 deletion
@@ -135,6 +135,7 @@ def __init__(self, task="binary", **config):
         self._task = task if isinstance(task, Task) else task_factory(task, None, None)
         self.params = self.config2params(config)
         self.estimator_class = self._model = None
+        self.estimator_baseclass = "sklearn"
         if "_estimator_type" in self.params:
             self._estimator_type = self.params.pop("_estimator_type")
         else:
@@ -439,6 +440,7 @@ def __init__(self, task="binary", **config):
             raise SPARK_ERROR
         super().__init__(task, **config)
         self.df_train = None
+        self.estimator_baseclass = "spark"

     def _preprocess(
         self,
@@ -974,7 +976,7 @@ def _tokenize_text(self, X, y=None, **kwargs):
         from .nlp.huggingface.utils import tokenize_text
         from .nlp.utils import is_a_list_of_str

-        is_str = str(X.dtypes[0]) in ("string", "str")
+        is_str = str(X.dtypes.iloc[0]) in ("string", "str")
         is_list_of_str = is_a_list_of_str(X[list(X.keys())[0]].to_list()[0])

         if is_str or is_list_of_str:
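A note on the _tokenize_text change above: X.dtypes is a Series indexed by column names, so X.dtypes[0] relies on the integer-key positional fallback that recent pandas deprecates; X.dtypes.iloc[0] asks for the first dtype by position explicitly. A small illustration with assumed data:

import pandas as pd

X = pd.DataFrame({"text": pd.array(["a", "b"], dtype="string"), "label": [0, 1]})
first_dtype = str(X.dtypes.iloc[0])  # position-based, independent of column labels
print(first_dtype)  # "string" here, so the is_str check passes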

flaml/automl/nlp/huggingface/training_args.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 from flaml.automl.task.task import NLG_TASKS

 try:
-    from transformers import TrainingArguments
+    from transformers import Seq2SeqTrainingArguments as TrainingArguments
 except ImportError:
     TrainingArguments = object

flaml/automl/nlp/huggingface/utils.py

Lines changed: 1 addition & 1 deletion
@@ -396,7 +396,7 @@ def get_this_model(checkpoint_path, task, model_config):

     if task in (SEQCLASSIFICATION, SEQREGRESSION):
         return AutoModelForSequenceClassification.from_pretrained(
-            checkpoint_path, config=model_config, ignore_mismatched_sizes=True
+            checkpoint_path, config=model_config, ignore_mismatched_sizes=True, trust_remote_code=True
         )
     elif task == TOKENCLASSIFICATION:
         return AutoModelForTokenClassification.from_pretrained(checkpoint_path, config=model_config)

flaml/automl/task/time_series_task.py

Lines changed: 1 addition & 1 deletion
@@ -151,7 +151,7 @@ def validate_data(
             raise ValueError("Must supply either X_train_all and y_train_all, or dataframe and label")

         try:
-            dataframe[self.time_col] = pd.to_datetime(dataframe[self.time_col])
+            dataframe.loc[:, self.time_col] = pd.to_datetime(dataframe[self.time_col])
         except Exception:
             raise ValueError(
                 f"For '{TS_FORECAST}' task, time column {self.time_col} must contain timestamp values."

flaml/automl/time_series/sklearn.py

Lines changed: 10 additions & 1 deletion
@@ -76,6 +76,8 @@ def __init__(
         self.pca = None

     def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs):
+        if "is_retrain" in kwargs:
+            kwargs.pop("is_retrain")
         self._X = X
         self._y = y

@@ -92,7 +94,14 @@ def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs):

         for i, model in enumerate(self.models):
             offset = i + self.lags
-            model.fit(X_trans[: len(X) - offset], y[offset:], **fit_params)
+            if len(X) - offset > 2:
+                # series with length 2 will meet All features are either constant or ignored.
+                # TODO: see why the non-constant features are ignored. Selector?
+                model.fit(X_trans[: len(X) - offset], y[offset:], **fit_params)
+            elif len(X) > offset and "catboost" not in str(model).lower():
+                model.fit(X_trans[: len(X) - offset], y[offset:], **fit_params)
+            else:
+                print("[INFO]: Length of data should longer than period + lags.")
         return self

     def predict(self, X, X_train=None, y_train=None):
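The guard added to fit() above skips horizon models whose training window is too short. A rough sketch of the window arithmetic it checks (lags and series length are made-up values for illustration):

import numpy as np

lags, n = 3, 6            # assumed values
X = np.arange(n)
y = np.arange(n) * 10

for i in range(2):        # one model per forecast step
    offset = i + lags
    rows = n - offset     # rows available to train model i
    if rows > 2:
        print(f"model {i}: fit on X[:{rows}] against y[{offset}:]")
    else:
        print(f"model {i}: only {rows} rows, shorter than period + lags; skipped")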

flaml/automl/time_series/ts_data.py

Lines changed: 7 additions & 2 deletions
@@ -121,7 +121,12 @@ def X_val(self) -> pd.DataFrame:

     @property
     def X_all(self) -> pd.DataFrame:
-        return pd.concat([self.X_train, self.X_val], axis=0)
+        # Remove empty or all-NA columns before concatenation
+        X_train_filtered = self.X_train.dropna(axis=1, how="all")
+        X_val_filtered = self.X_val.dropna(axis=1, how="all")
+
+        # Concatenate the filtered DataFrames
+        return pd.concat([X_train_filtered, X_val_filtered], axis=0)

     @property
     def y_train(self) -> pd.DataFrame:
@@ -472,7 +477,7 @@ def transform(self, X: Union[DataFrame, np.array], y=None):
             if "__NAN__" not in X[col].cat.categories:
                 X[col] = X[col].cat.add_categories("__NAN__").fillna("__NAN__")
             else:
-                X[col] = X[col].fillna("__NAN__")
+                X[col] = X[col].fillna("__NAN__").infer_objects(copy=False)
             X[col] = X[col].astype("category")

         for column in self.num_columns:
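The X_all change above drops all-NA columns before stacking train and validation rows, which also sidesteps the warning newer pandas versions emit when concatenating frames with empty or all-NA columns. A minimal sketch with made-up frames:

import numpy as np
import pandas as pd

X_train = pd.DataFrame({"a": [1, 2], "b": [np.nan, np.nan]})
X_val = pd.DataFrame({"a": [3], "b": [np.nan]})

# Drop columns that are entirely NA, then concatenate along the row axis.
X_all = pd.concat(
    [X_train.dropna(axis=1, how="all"), X_val.dropna(axis=1, how="all")],
    axis=0,
)
print(X_all)  # only column "a" survives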

test/automl/test_regression.py

Lines changed: 13 additions & 1 deletion
@@ -130,7 +130,7 @@ def test_sparse_matrix_regression(self):
         )
         automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings)

-    def test_parallel(self, hpo_method=None):
+    def test_parallel_and_pickle(self, hpo_method=None):
         automl_experiment = AutoML()
         automl_settings = {
             "time_budget": 10,
@@ -153,6 +153,18 @@ def test_parallel(self, hpo_method=None):
         except ImportError:
             return

+        # test pickle and load_pickle, should work for prediction
+        automl_experiment.pickle("automl_xgboost_spark.pkl")
+        automl_loaded = AutoML().load_pickle("automl_xgboost_spark.pkl")
+        assert automl_loaded.best_estimator == automl_experiment.best_estimator
+        assert automl_loaded.best_loss == automl_experiment.best_loss
+        automl_loaded.predict(X_train)
+
+        import shutil
+
+        shutil.rmtree("automl_xgboost_spark.pkl", ignore_errors=True)
+        shutil.rmtree("automl_xgboost_spark.pkl.flaml_artifacts", ignore_errors=True)
+
     def test_sparse_matrix_regression_holdout(self):
         X_train = scipy.sparse.random(8, 100)
         y_train = np.random.uniform(size=8)

test/spark/test_0sparkml.py

Lines changed: 21 additions & 8 deletions
@@ -165,7 +165,7 @@ def test_spark_synapseml_rank():
     _test_spark_synapseml_lightgbm(spark, "rank")


-def test_spark_input_df():
+def test_spark_input_df_and_pickle():
     import pandas as pd

     file_url = "https://mmlspark.blob.core.windows.net/publicwasb/company_bankruptcy_prediction_data.csv"
@@ -201,6 +201,19 @@ def test_spark_input_df():
         **settings,
     )

+    # test pickle and load_pickle, should work for prediction
+    automl.pickle("automl_spark.pkl")
+    automl_loaded = AutoML().load_pickle("automl_spark.pkl")
+    assert automl_loaded.best_estimator == automl.best_estimator
+    assert automl_loaded.best_loss == automl.best_loss
+    automl_loaded.predict(df)
+    automl_loaded.model.estimator.transform(test_data)
+
+    import shutil
+
+    shutil.rmtree("automl_spark.pkl", ignore_errors=True)
+    shutil.rmtree("automl_spark.pkl.flaml_artifacts", ignore_errors=True)
+
     if estimator_list == ["rf_spark"]:
         return

@@ -393,13 +406,13 @@ def test_auto_convert_dtypes_spark():


 if __name__ == "__main__":
-    test_spark_synapseml_classification()
-    test_spark_synapseml_regression()
-    test_spark_synapseml_rank()
-    test_spark_input_df()
-    test_get_random_dataframe()
-    test_auto_convert_dtypes_pandas()
-    test_auto_convert_dtypes_spark()
+    # test_spark_synapseml_classification()
+    # test_spark_synapseml_regression()
+    # test_spark_synapseml_rank()
+    test_spark_input_df_and_pickle()
+    # test_get_random_dataframe()
+    # test_auto_convert_dtypes_pandas()
+    # test_auto_convert_dtypes_spark()

     # import cProfile
     # import pstats

test/spark/test_automl.py

Lines changed: 23 additions & 11 deletions
@@ -28,10 +28,10 @@
 pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]


-def test_parallel_xgboost(hpo_method=None, data_size=1000):
+def test_parallel_xgboost_and_pickle(hpo_method=None, data_size=1000):
     automl_experiment = AutoML()
     automl_settings = {
-        "time_budget": 10,
+        "time_budget": 30,
         "metric": "ap",
         "task": "classification",
         "log_file_name": "test/sparse_classification.log",
@@ -53,15 +53,27 @@ def test_parallel_xgboost(hpo_method=None, data_size=1000):
     print(automl_experiment.best_iteration)
     print(automl_experiment.best_estimator)

+    # test pickle and load_pickle, should work for prediction
+    automl_experiment.pickle("automl_xgboost_spark.pkl")
+    automl_loaded = AutoML().load_pickle("automl_xgboost_spark.pkl")
+    assert automl_loaded.best_estimator == automl_experiment.best_estimator
+    assert automl_loaded.best_loss == automl_experiment.best_loss
+    automl_loaded.predict(X_train)
+
+    import shutil
+
+    shutil.rmtree("automl_xgboost_spark.pkl", ignore_errors=True)
+    shutil.rmtree("automl_xgboost_spark.pkl.flaml_artifacts", ignore_errors=True)
+

 def test_parallel_xgboost_others():
     # use random search as the hpo_method
-    test_parallel_xgboost(hpo_method="random")
+    test_parallel_xgboost_and_pickle(hpo_method="random")


 @pytest.mark.skip(reason="currently not supporting too large data, will support spark dataframe in the future")
 def test_large_dataset():
-    test_parallel_xgboost(data_size=90000000)
+    test_parallel_xgboost_and_pickle(data_size=90000000)


 @pytest.mark.skipif(
@@ -95,10 +107,10 @@ def test_custom_learner(data_size=1000):


 if __name__ == "__main__":
-    test_parallel_xgboost()
-    test_parallel_xgboost_others()
-    # test_large_dataset()
-    if skip_my_learner:
-        print("please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file")
-    else:
-        test_custom_learner()
+    test_parallel_xgboost_and_pickle()
+    # test_parallel_xgboost_others()
+    # # test_large_dataset()
+    # if skip_my_learner:
+    #     print("please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file")
+    # else:
+    #     test_custom_learner()
