Commit ced1d6f

Support pickling the whole AutoML instance, Sync Fabric till 0d4ab16f (#1481)
1 parent bb213e7 · commit ced1d6f

10 files changed: 451 additions & 37 deletions


flaml/automl/automl.py

Lines changed: 371 additions & 10 deletions
Large diffs are not rendered by default.
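The pickling support added in automl.py is exercised by the updated tests further down: AutoML.pickle() writes the whole fitted instance to disk and AutoML().load_pickle() restores it for prediction. A minimal usage sketch based on those test calls (the dataset, file name, and time budget here are illustrative, not part of this commit):

from flaml import AutoML
from sklearn.datasets import load_iris

X_train, y_train = load_iris(return_X_y=True)

automl = AutoML()
automl.fit(X_train, y_train, task="classification", time_budget=10)

# Persist the whole AutoML instance ...
automl.pickle("automl.pkl")

# ... and restore it later, e.g. in another process, for prediction.
automl_loaded = AutoML().load_pickle("automl.pkl")
print(automl_loaded.best_estimator, automl_loaded.best_loss)
print(automl_loaded.predict(X_train)[:5])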

flaml/automl/model.py

Lines changed: 3 additions & 1 deletion
@@ -135,6 +135,7 @@ def __init__(self, task="binary", **config):
         self._task = task if isinstance(task, Task) else task_factory(task, None, None)
         self.params = self.config2params(config)
         self.estimator_class = self._model = None
+        self.estimator_baseclass = "sklearn"
         if "_estimator_type" in self.params:
             self._estimator_type = self.params.pop("_estimator_type")
         else:
@@ -439,6 +440,7 @@ def __init__(self, task="binary", **config):
             raise SPARK_ERROR
         super().__init__(task, **config)
         self.df_train = None
+        self.estimator_baseclass = "spark"

     def _preprocess(
         self,
@@ -974,7 +976,7 @@ def _tokenize_text(self, X, y=None, **kwargs):
         from .nlp.huggingface.utils import tokenize_text
         from .nlp.utils import is_a_list_of_str

-        is_str = str(X.dtypes[0]) in ("string", "str")
+        is_str = str(X.dtypes.iloc[0]) in ("string", "str")
         is_list_of_str = is_a_list_of_str(X[list(X.keys())[0]].to_list()[0])

         if is_str or is_list_of_str:
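A note on the _tokenize_text change above: X.dtypes is a Series indexed by column names, so X.dtypes[0] relies on the integer-key positional fallback that recent pandas deprecates; X.dtypes.iloc[0] asks for the first dtype by position explicitly. A small illustration with assumed data:

import pandas as pd

X = pd.DataFrame({"text": pd.array(["a", "b"], dtype="string"), "label": [0, 1]})
first_dtype = str(X.dtypes.iloc[0])  # position-based, independent of column labels
print(first_dtype)  # "string" here, so the is_str check passes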

flaml/automl/nlp/huggingface/training_args.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 from flaml.automl.task.task import NLG_TASKS

 try:
-    from transformers import TrainingArguments
+    from transformers import Seq2SeqTrainingArguments as TrainingArguments
 except ImportError:
     TrainingArguments = object

flaml/automl/nlp/huggingface/utils.py

Lines changed: 1 addition & 1 deletion
@@ -396,7 +396,7 @@ def get_this_model(checkpoint_path, task, model_config):

     if task in (SEQCLASSIFICATION, SEQREGRESSION):
         return AutoModelForSequenceClassification.from_pretrained(
-            checkpoint_path, config=model_config, ignore_mismatched_sizes=True
+            checkpoint_path, config=model_config, ignore_mismatched_sizes=True, trust_remote_code=True
         )
     elif task == TOKENCLASSIFICATION:
         return AutoModelForTokenClassification.from_pretrained(checkpoint_path, config=model_config)

flaml/automl/task/time_series_task.py

Lines changed: 1 addition & 1 deletion
@@ -151,7 +151,7 @@ def validate_data(
             raise ValueError("Must supply either X_train_all and y_train_all, or dataframe and label")

         try:
-            dataframe[self.time_col] = pd.to_datetime(dataframe[self.time_col])
+            dataframe.loc[:, self.time_col] = pd.to_datetime(dataframe[self.time_col])
         except Exception:
             raise ValueError(
                 f"For '{TS_FORECAST}' task, time column {self.time_col} must contain timestamp values."

flaml/automl/time_series/sklearn.py

Lines changed: 10 additions & 1 deletion
@@ -76,6 +76,8 @@ def __init__(
         self.pca = None

     def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs):
+        if "is_retrain" in kwargs:
+            kwargs.pop("is_retrain")
         self._X = X
         self._y = y

@@ -92,7 +94,14 @@ def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs):

         for i, model in enumerate(self.models):
             offset = i + self.lags
-            model.fit(X_trans[: len(X) - offset], y[offset:], **fit_params)
+            if len(X) - offset > 2:
+                # series with length 2 will meet All features are either constant or ignored.
+                # TODO: see why the non-constant features are ignored. Selector?
+                model.fit(X_trans[: len(X) - offset], y[offset:], **fit_params)
+            elif len(X) > offset and "catboost" not in str(model).lower():
+                model.fit(X_trans[: len(X) - offset], y[offset:], **fit_params)
+            else:
+                print("[INFO]: Length of data should longer than period + lags.")
         return self

     def predict(self, X, X_train=None, y_train=None):
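The guard added to fit() above skips horizon models whose training window is too short. A rough sketch of the window arithmetic it checks (lags and series length are made-up values for illustration):

import numpy as np

lags, n = 3, 6            # assumed values
X = np.arange(n)
y = np.arange(n) * 10

for i in range(2):        # one model per forecast step
    offset = i + lags
    rows = n - offset     # rows available to train model i
    if rows > 2:
        print(f"model {i}: fit on X[:{rows}] against y[{offset}:]")
    else:
        print(f"model {i}: only {rows} rows, shorter than period + lags; skipped")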

flaml/automl/time_series/ts_data.py

Lines changed: 7 additions & 2 deletions
@@ -121,7 +121,12 @@ def X_val(self) -> pd.DataFrame:

     @property
     def X_all(self) -> pd.DataFrame:
-        return pd.concat([self.X_train, self.X_val], axis=0)
+        # Remove empty or all-NA columns before concatenation
+        X_train_filtered = self.X_train.dropna(axis=1, how="all")
+        X_val_filtered = self.X_val.dropna(axis=1, how="all")
+
+        # Concatenate the filtered DataFrames
+        return pd.concat([X_train_filtered, X_val_filtered], axis=0)

     @property
     def y_train(self) -> pd.DataFrame:
@@ -472,7 +477,7 @@ def transform(self, X: Union[DataFrame, np.array], y=None):
             if "__NAN__" not in X[col].cat.categories:
                 X[col] = X[col].cat.add_categories("__NAN__").fillna("__NAN__")
             else:
-                X[col] = X[col].fillna("__NAN__")
+                X[col] = X[col].fillna("__NAN__").infer_objects(copy=False)
             X[col] = X[col].astype("category")

         for column in self.num_columns:
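The X_all change above drops all-NA columns before stacking train and validation rows, which also sidesteps the warning newer pandas versions emit when concatenating frames with empty or all-NA columns. A minimal sketch with made-up frames:

import numpy as np
import pandas as pd

X_train = pd.DataFrame({"a": [1, 2], "b": [np.nan, np.nan]})
X_val = pd.DataFrame({"a": [3], "b": [np.nan]})

# Drop columns that are entirely NA, then concatenate along the row axis.
X_all = pd.concat(
    [X_train.dropna(axis=1, how="all"), X_val.dropna(axis=1, how="all")],
    axis=0,
)
print(X_all)  # only column "a" survives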

test/automl/test_regression.py

Lines changed: 13 additions & 1 deletion
@@ -130,7 +130,7 @@ def test_sparse_matrix_regression(self):
         )
         automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings)

-    def test_parallel(self, hpo_method=None):
+    def test_parallel_and_pickle(self, hpo_method=None):
         automl_experiment = AutoML()
         automl_settings = {
             "time_budget": 10,
@@ -153,6 +153,18 @@ def test_parallel(self, hpo_method=None):
         except ImportError:
             return

+        # test pickle and load_pickle, should work for prediction
+        automl_experiment.pickle("automl_xgboost_spark.pkl")
+        automl_loaded = AutoML().load_pickle("automl_xgboost_spark.pkl")
+        assert automl_loaded.best_estimator == automl_experiment.best_estimator
+        assert automl_loaded.best_loss == automl_experiment.best_loss
+        automl_loaded.predict(X_train)
+
+        import shutil
+
+        shutil.rmtree("automl_xgboost_spark.pkl", ignore_errors=True)
+        shutil.rmtree("automl_xgboost_spark.pkl.flaml_artifacts", ignore_errors=True)
+
     def test_sparse_matrix_regression_holdout(self):
         X_train = scipy.sparse.random(8, 100)
         y_train = np.random.uniform(size=8)

test/spark/test_0sparkml.py

Lines changed: 21 additions & 8 deletions
@@ -165,7 +165,7 @@ def test_spark_synapseml_rank():
     _test_spark_synapseml_lightgbm(spark, "rank")


-def test_spark_input_df():
+def test_spark_input_df_and_pickle():
     import pandas as pd

     file_url = "https://mmlspark.blob.core.windows.net/publicwasb/company_bankruptcy_prediction_data.csv"
@@ -201,6 +201,19 @@ def test_spark_input_df():
         **settings,
     )

+    # test pickle and load_pickle, should work for prediction
+    automl.pickle("automl_spark.pkl")
+    automl_loaded = AutoML().load_pickle("automl_spark.pkl")
+    assert automl_loaded.best_estimator == automl.best_estimator
+    assert automl_loaded.best_loss == automl.best_loss
+    automl_loaded.predict(df)
+    automl_loaded.model.estimator.transform(test_data)
+
+    import shutil
+
+    shutil.rmtree("automl_spark.pkl", ignore_errors=True)
+    shutil.rmtree("automl_spark.pkl.flaml_artifacts", ignore_errors=True)
+
     if estimator_list == ["rf_spark"]:
         return

@@ -393,13 +406,13 @@ def test_auto_convert_dtypes_spark():


 if __name__ == "__main__":
-    test_spark_synapseml_classification()
-    test_spark_synapseml_regression()
-    test_spark_synapseml_rank()
-    test_spark_input_df()
-    test_get_random_dataframe()
-    test_auto_convert_dtypes_pandas()
-    test_auto_convert_dtypes_spark()
+    # test_spark_synapseml_classification()
+    # test_spark_synapseml_regression()
+    # test_spark_synapseml_rank()
+    test_spark_input_df_and_pickle()
+    # test_get_random_dataframe()
+    # test_auto_convert_dtypes_pandas()
+    # test_auto_convert_dtypes_spark()

     # import cProfile
     # import pstats

test/spark/test_automl.py

Lines changed: 23 additions & 11 deletions
@@ -28,10 +28,10 @@
 pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]


-def test_parallel_xgboost(hpo_method=None, data_size=1000):
+def test_parallel_xgboost_and_pickle(hpo_method=None, data_size=1000):
     automl_experiment = AutoML()
     automl_settings = {
-        "time_budget": 10,
+        "time_budget": 30,
         "metric": "ap",
         "task": "classification",
         "log_file_name": "test/sparse_classification.log",
@@ -53,15 +53,27 @@ def test_parallel_xgboost(hpo_method=None, data_size=1000):
     print(automl_experiment.best_iteration)
     print(automl_experiment.best_estimator)

+    # test pickle and load_pickle, should work for prediction
+    automl_experiment.pickle("automl_xgboost_spark.pkl")
+    automl_loaded = AutoML().load_pickle("automl_xgboost_spark.pkl")
+    assert automl_loaded.best_estimator == automl_experiment.best_estimator
+    assert automl_loaded.best_loss == automl_experiment.best_loss
+    automl_loaded.predict(X_train)
+
+    import shutil
+
+    shutil.rmtree("automl_xgboost_spark.pkl", ignore_errors=True)
+    shutil.rmtree("automl_xgboost_spark.pkl.flaml_artifacts", ignore_errors=True)
+

 def test_parallel_xgboost_others():
     # use random search as the hpo_method
-    test_parallel_xgboost(hpo_method="random")
+    test_parallel_xgboost_and_pickle(hpo_method="random")


 @pytest.mark.skip(reason="currently not supporting too large data, will support spark dataframe in the future")
 def test_large_dataset():
-    test_parallel_xgboost(data_size=90000000)
+    test_parallel_xgboost_and_pickle(data_size=90000000)


 @pytest.mark.skipif(
@@ -95,10 +107,10 @@ def test_custom_learner(data_size=1000):


 if __name__ == "__main__":
-    test_parallel_xgboost()
-    test_parallel_xgboost_others()
-    # test_large_dataset()
-    if skip_my_learner:
-        print("please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file")
-    else:
-        test_custom_learner()
+    test_parallel_xgboost_and_pickle()
+    # test_parallel_xgboost_others()
+    # # test_large_dataset()
+    # if skip_my_learner:
+    #     print("please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file")
+    # else:
+    #     test_custom_learner()
