From 6a8a89b08fc851f5fa1c87ff7d2743f29c75f043 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 20 Jan 2026 04:08:35 +0000 Subject: [PATCH 1/7] Initial plan From ca407482f3190f072df524490c32e134723c3cf2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 20 Jan 2026 04:19:53 +0000 Subject: [PATCH 2/7] Add public preprocess() API methods for AutoML and estimators Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> --- flaml/automl/automl.py | 44 ++++++ flaml/automl/model.py | 29 ++++ test/automl/test_preprocess_api.py | 229 +++++++++++++++++++++++++++++ 3 files changed, 302 insertions(+) create mode 100644 test/automl/test_preprocess_api.py diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py index 3a4ce2c37c..97a2b68476 100644 --- a/flaml/automl/automl.py +++ b/flaml/automl/automl.py @@ -815,6 +815,50 @@ def predict_proba(self, X, **pred_kwargs): proba = self._trained_estimator.predict_proba(X, **pred_kwargs) return proba + def preprocess( + self, + X: np.array | DataFrame | list[str] | list[list[str]] | psDataFrame, + ): + """Preprocess data using task-level preprocessing. + + This method applies task-level preprocessing transformations to the input data, + including handling of data types, sparse matrices, and feature transformations + that were learned during the fit phase. This should be called before any + estimator-level preprocessing. + + Args: + X: A numpy array or pandas dataframe or pyspark.pandas dataframe + of featurized instances, shape n * m, + or for time series forecast tasks: + a pandas dataframe with the first column containing + timestamp values (datetime type) or an integer n for + the predict steps (only valid when the estimator is + arima or sarimax). Other columns in the dataframe + are assumed to be exogenous variables (categorical + or numeric). + + Returns: + Preprocessed data in the same format as input (numpy array, DataFrame, etc.). + + Raises: + AttributeError: If the model has not been fitted yet. + + Example: + ```python + automl = AutoML() + automl.fit(X_train, y_train, task="classification") + + # Apply task-level preprocessing to new data + X_test_preprocessed = automl.preprocess(X_test) + ``` + """ + if not hasattr(self, "_state") or self._state is None: + raise AttributeError("AutoML instance has not been fitted yet. Please call fit() first.") + if not hasattr(self, "_transformer"): + raise AttributeError("Transformer not initialized. Please call fit() first.") + + return self._state.task.preprocess(X, self._transformer) + def add_learner(self, learner_name, learner_class): """Add a customized learner. diff --git a/flaml/automl/model.py b/flaml/automl/model.py index 0c6c47cec8..040c383608 100644 --- a/flaml/automl/model.py +++ b/flaml/automl/model.py @@ -295,6 +295,35 @@ def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs): train_time = self._fit(X_train, y_train, **kwargs) return train_time + def preprocess(self, X): + """Preprocess data using estimator-level preprocessing. + + This method applies estimator-specific preprocessing transformations to the input data. + This is the second level of preprocessing that should be applied after task-level + preprocessing (automl.preprocess()). Different estimator types may apply different + preprocessing steps (e.g., sparse matrix conversion, dataframe handling). + + Args: + X: A numpy array or a dataframe of featurized instances, shape n*m. + + Returns: + Preprocessed data ready for the estimator's predict/fit methods. + + Example: + ```python + automl = AutoML() + automl.fit(X_train, y_train, task="classification") + + # First apply task-level preprocessing + X_test_task = automl.preprocess(X_test) + + # Then apply estimator-level preprocessing + estimator = automl.model + X_test_estimator = estimator.preprocess(X_test_task) + ``` + """ + return self._preprocess(X) + def predict(self, X, **kwargs): """Predict label from features. diff --git a/test/automl/test_preprocess_api.py b/test/automl/test_preprocess_api.py new file mode 100644 index 0000000000..1d9ef75d30 --- /dev/null +++ b/test/automl/test_preprocess_api.py @@ -0,0 +1,229 @@ +"""Tests for the public preprocessor APIs.""" +import unittest + +import numpy as np +import pandas as pd +from sklearn.datasets import load_breast_cancer, load_diabetes + +from flaml import AutoML + + +class TestPreprocessAPI(unittest.TestCase): + """Test cases for the public preprocess() API methods.""" + + def test_automl_preprocess_before_fit(self): + """Test that calling preprocess before fit raises an error.""" + automl = AutoML() + X_test = np.array([[1, 2, 3], [4, 5, 6]]) + + with self.assertRaises(AttributeError) as context: + automl.preprocess(X_test) + # Check that an error is raised about not being fitted + self.assertIn("fit()", str(context.exception)) + + def test_automl_preprocess_classification(self): + """Test task-level preprocessing for classification.""" + # Load dataset + X, y = load_breast_cancer(return_X_y=True) + X_train, y_train = X[:400], y[:400] + X_test = X[400:450] + + # Train AutoML + automl = AutoML() + automl_settings = { + "time_budget": 5, + "task": "classification", + "metric": "accuracy", + "estimator_list": ["lgbm"], + "verbose": 0, + } + automl.fit(X_train, y_train, **automl_settings) + + # Test task-level preprocessing + X_preprocessed = automl.preprocess(X_test) + + # Verify the output is not None and has the right shape + self.assertIsNotNone(X_preprocessed) + self.assertEqual(X_preprocessed.shape[0], X_test.shape[0]) + + def test_automl_preprocess_regression(self): + """Test task-level preprocessing for regression.""" + # Load dataset + X, y = load_diabetes(return_X_y=True) + X_train, y_train = X[:300], y[:300] + X_test = X[300:350] + + # Train AutoML + automl = AutoML() + automl_settings = { + "time_budget": 5, + "task": "regression", + "metric": "r2", + "estimator_list": ["lgbm"], + "verbose": 0, + } + automl.fit(X_train, y_train, **automl_settings) + + # Test task-level preprocessing + X_preprocessed = automl.preprocess(X_test) + + # Verify the output + self.assertIsNotNone(X_preprocessed) + self.assertEqual(X_preprocessed.shape[0], X_test.shape[0]) + + def test_automl_preprocess_with_dataframe(self): + """Test task-level preprocessing with pandas DataFrame.""" + # Create a simple dataset + X_train = pd.DataFrame({ + "feature1": [1, 2, 3, 4, 5] * 20, + "feature2": [5, 4, 3, 2, 1] * 20, + "category": ["a", "b", "a", "b", "a"] * 20, + }) + y_train = pd.Series([0, 1, 0, 1, 0] * 20) + + X_test = pd.DataFrame({ + "feature1": [6, 7, 8], + "feature2": [1, 2, 3], + "category": ["a", "b", "a"], + }) + + # Train AutoML + automl = AutoML() + automl_settings = { + "time_budget": 5, + "task": "classification", + "metric": "accuracy", + "estimator_list": ["lgbm"], + "verbose": 0, + } + automl.fit(X_train, y_train, **automl_settings) + + # Test preprocessing + X_preprocessed = automl.preprocess(X_test) + + # Verify the output + self.assertIsNotNone(X_preprocessed) + # The preprocessed data should have the same number of rows + self.assertEqual(len(X_preprocessed) if hasattr(X_preprocessed, '__len__') else X_preprocessed.shape[0], + len(X_test)) + + def test_estimator_preprocess(self): + """Test estimator-level preprocessing.""" + # Load dataset + X, y = load_breast_cancer(return_X_y=True) + X_train, y_train = X[:400], y[:400] + X_test = X[400:450] + + # Train AutoML + automl = AutoML() + automl_settings = { + "time_budget": 5, + "task": "classification", + "metric": "accuracy", + "estimator_list": ["lgbm"], + "verbose": 0, + } + automl.fit(X_train, y_train, **automl_settings) + + # Get the trained estimator + estimator = automl.model + self.assertIsNotNone(estimator) + + # First apply task-level preprocessing + X_task_preprocessed = automl.preprocess(X_test) + + # Then apply estimator-level preprocessing + X_estimator_preprocessed = estimator.preprocess(X_task_preprocessed) + + # Verify the output + self.assertIsNotNone(X_estimator_preprocessed) + self.assertEqual(X_estimator_preprocessed.shape[0], X_test.shape[0]) + + def test_preprocess_pipeline(self): + """Test the complete preprocessing pipeline (task-level then estimator-level).""" + # Load dataset + X, y = load_breast_cancer(return_X_y=True) + X_train, y_train = X[:400], y[:400] + X_test = X[400:450] + + # Train AutoML + automl = AutoML() + automl_settings = { + "time_budget": 5, + "task": "classification", + "metric": "accuracy", + "estimator_list": ["lgbm"], + "verbose": 0, + } + automl.fit(X_train, y_train, **automl_settings) + + # Apply the complete preprocessing pipeline + X_task_preprocessed = automl.preprocess(X_test) + X_final = automl.model.preprocess(X_task_preprocessed) + + # Verify predictions work with preprocessed data + # The internal predict already does this preprocessing, + # but we verify our manual preprocessing gives consistent results + y_pred_manual = automl.model._model.predict(X_final) + y_pred_auto = automl.predict(X_test) + + # Both should give the same predictions + np.testing.assert_array_equal(y_pred_manual, y_pred_auto) + + def test_preprocess_with_mixed_types(self): + """Test preprocessing with mixed data types.""" + # Create dataset with mixed types + X_train = pd.DataFrame({ + "numeric1": np.random.rand(100), + "numeric2": np.random.randint(0, 100, 100), + "categorical": np.random.choice(["cat", "dog", "bird"], 100), + "boolean": np.random.choice([True, False], 100), + }) + y_train = pd.Series(np.random.randint(0, 2, 100)) + + X_test = pd.DataFrame({ + "numeric1": np.random.rand(10), + "numeric2": np.random.randint(0, 100, 10), + "categorical": np.random.choice(["cat", "dog", "bird"], 10), + "boolean": np.random.choice([True, False], 10), + }) + + # Train AutoML + automl = AutoML() + automl_settings = { + "time_budget": 5, + "task": "classification", + "metric": "accuracy", + "estimator_list": ["lgbm"], + "verbose": 0, + } + automl.fit(X_train, y_train, **automl_settings) + + # Test preprocessing + X_preprocessed = automl.preprocess(X_test) + + # Verify the output + self.assertIsNotNone(X_preprocessed) + + def test_estimator_preprocess_without_automl(self): + """Test that estimator.preprocess() can be used independently.""" + from flaml.automl.model import LGBMEstimator + + # Create a simple estimator + X_train = np.random.rand(100, 5) + y_train = np.random.randint(0, 2, 100) + + estimator = LGBMEstimator(task="classification") + estimator.fit(X_train, y_train) + + # Test preprocessing + X_test = np.random.rand(10, 5) + X_preprocessed = estimator.preprocess(X_test) + + # Verify the output + self.assertIsNotNone(X_preprocessed) + self.assertEqual(X_preprocessed.shape, X_test.shape) + + +if __name__ == "__main__": + unittest.main() From 382350d82cc6fe2e19ddb20bdf75e2a5d5a607ba Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 20 Jan 2026 04:22:19 +0000 Subject: [PATCH 3/7] Add documentation for preprocess() API methods Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> --- .../docs/Use-Cases/Task-Oriented-AutoML.md | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/website/docs/Use-Cases/Task-Oriented-AutoML.md b/website/docs/Use-Cases/Task-Oriented-AutoML.md index f43e6d7466..96afe1623a 100644 --- a/website/docs/Use-Cases/Task-Oriented-AutoML.md +++ b/website/docs/Use-Cases/Task-Oriented-AutoML.md @@ -704,6 +704,64 @@ plt.barh( ![png](images/feature_importance.png) +### Preprocess data + +FLAML provides two levels of preprocessing that can be accessed as public APIs: + +1. **Task-level preprocessing** (`automl.preprocess()`): This applies transformations that are specific to the task type, such as handling data types, sparse matrices, and feature transformations learned during training. + +2. **Estimator-level preprocessing** (`estimator.preprocess()`): This applies transformations specific to the estimator type (e.g., LightGBM, XGBoost). + +The task-level preprocessing should be applied before the estimator-level preprocessing. + +#### Task-level preprocessing + +```python +from flaml import AutoML +import numpy as np + +# Train the model +automl = AutoML() +automl.fit(X_train, y_train, task="classification", time_budget=60) + +# Apply task-level preprocessing to new data +X_test_preprocessed = automl.preprocess(X_test) + +# Now you can use this with the estimator +predictions = automl.model.predict(X_test_preprocessed) +``` + +#### Estimator-level preprocessing + +```python +# Get the trained estimator +estimator = automl.model + +# Apply task-level preprocessing first +X_test_task = automl.preprocess(X_test) + +# Then apply estimator-level preprocessing +X_test_estimator = estimator.preprocess(X_test_task) + +# Use the fully preprocessed data with the underlying model +predictions = estimator._model.predict(X_test_estimator) +``` + +#### Complete preprocessing pipeline + +For most use cases, the `predict()` method already handles both levels of preprocessing internally. However, if you need to apply preprocessing separately (e.g., for custom inference pipelines or debugging), you can use: + +```python +# Complete preprocessing pipeline +X_task_preprocessed = automl.preprocess(X_test) +X_final = automl.model.preprocess(X_task_preprocessed) + +# This is equivalent to what happens internally in: +predictions = automl.predict(X_test) +``` + +**Note**: The `preprocess()` methods can only be called after `fit()` has been executed, as they rely on the transformations learned during training. + ### Get best configuration We can find the best estimator's name and best configuration by: From bdf6c53c5d2c45bc8c6e427bd749affdd917787f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 20 Jan 2026 04:23:12 +0000 Subject: [PATCH 4/7] Add example script demonstrating preprocess() API usage Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> --- notebook/preprocess_api_example.py | 96 ++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 notebook/preprocess_api_example.py diff --git a/notebook/preprocess_api_example.py b/notebook/preprocess_api_example.py new file mode 100644 index 0000000000..3dcb232396 --- /dev/null +++ b/notebook/preprocess_api_example.py @@ -0,0 +1,96 @@ +""" +Example demonstrating the use of FLAML's preprocess() API. + +This script shows how to use both task-level and estimator-level preprocessing +APIs exposed by FLAML AutoML. +""" + +from flaml import AutoML +from sklearn.datasets import load_breast_cancer +from sklearn.model_selection import train_test_split +import numpy as np + +# Load and split data +print("Loading breast cancer dataset...") +X, y = load_breast_cancer(return_X_y=True) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + +print(f"Training data shape: {X_train.shape}") +print(f"Test data shape: {X_test.shape}") + +# Train AutoML model +print("\nTraining AutoML model...") +automl = AutoML() +automl_settings = { + "time_budget": 10, # 10 seconds + "task": "classification", + "metric": "accuracy", + "estimator_list": ["lgbm", "xgboost"], + "verbose": 0, +} +automl.fit(X_train, y_train, **automl_settings) + +print(f"Best estimator: {automl.best_estimator}") +print(f"Best accuracy: {1 - automl.best_loss:.4f}") + +# Example 1: Using task-level preprocessing +print("\n" + "=" * 60) +print("Example 1: Task-level preprocessing") +print("=" * 60) +X_test_task = automl.preprocess(X_test) +print(f"Original test data shape: {X_test.shape}") +print(f"After task preprocessing: {X_test_task.shape}") + +# Example 2: Using estimator-level preprocessing +print("\n" + "=" * 60) +print("Example 2: Estimator-level preprocessing") +print("=" * 60) +estimator = automl.model +X_test_estimator = estimator.preprocess(X_test_task) +print(f"After estimator preprocessing: {X_test_estimator.shape}") + +# Example 3: Complete preprocessing pipeline +print("\n" + "=" * 60) +print("Example 3: Complete preprocessing pipeline") +print("=" * 60) +# Apply both levels of preprocessing +X_preprocessed = automl.preprocess(X_test) +X_final = automl.model.preprocess(X_preprocessed) + +# Manual prediction using fully preprocessed data +y_pred_manual = automl.model._model.predict(X_final) + +# Compare with AutoML's predict method (which does preprocessing internally) +y_pred_auto = automl.predict(X_test) + +print(f"Predictions match: {np.array_equal(y_pred_manual, y_pred_auto)}") +print(f"Manual prediction sample: {y_pred_manual[:5]}") +print(f"Auto prediction sample: {y_pred_auto[:5]}") + +# Example 4: Using preprocessing for custom inference +print("\n" + "=" * 60) +print("Example 4: Custom inference with preprocessing") +print("=" * 60) +# You might want to apply preprocessing separately for: +# - Debugging +# - Custom inference pipelines +# - Integration with other tools + +# Get preprocessed features +X_features = automl.preprocess(X_test) +X_features = automl.model.preprocess(X_features) + +# Now you can use these features with the underlying model or for analysis +print(f"Preprocessed features ready for custom use: {X_features.shape}") +print(f"Feature statistics - Mean: {np.mean(X_features):.4f}, Std: {np.std(X_features):.4f}") + +print("\n" + "=" * 60) +print("Summary") +print("=" * 60) +print("The preprocess() API allows you to:") +print("1. Apply task-level preprocessing with automl.preprocess()") +print("2. Apply estimator-level preprocessing with estimator.preprocess()") +print("3. Chain both for complete preprocessing pipeline") +print("4. Use preprocessed data for custom inference or analysis") +print("\nNote: Task-level preprocessing should always be applied before") +print(" estimator-level preprocessing.") From 61ea727572a3b2a32510f271c33ccd5f4a69a3b9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 20 Jan 2026 04:25:40 +0000 Subject: [PATCH 5/7] Address code review feedback - fix type hints and simplify test logic Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> --- flaml/automl/automl.py | 4 ++-- test/automl/test_preprocess_api.py | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py index 97a2b68476..990fdf7e99 100644 --- a/flaml/automl/automl.py +++ b/flaml/automl/automl.py @@ -749,7 +749,7 @@ def score( def predict( self, - X: np.array | DataFrame | list[str] | list[list[str]] | psDataFrame, + X: np.ndarray | DataFrame | list[str] | list[list[str]] | psDataFrame, **pred_kwargs, ): """Predict label from features. @@ -817,7 +817,7 @@ def predict_proba(self, X, **pred_kwargs): def preprocess( self, - X: np.array | DataFrame | list[str] | list[list[str]] | psDataFrame, + X: np.ndarray | DataFrame | list[str] | list[list[str]] | psDataFrame, ): """Preprocess data using task-level preprocessing. diff --git a/test/automl/test_preprocess_api.py b/test/automl/test_preprocess_api.py index 1d9ef75d30..412b7e2f98 100644 --- a/test/automl/test_preprocess_api.py +++ b/test/automl/test_preprocess_api.py @@ -101,11 +101,10 @@ def test_automl_preprocess_with_dataframe(self): # Test preprocessing X_preprocessed = automl.preprocess(X_test) - # Verify the output + # Verify the output - check the number of rows matches self.assertIsNotNone(X_preprocessed) - # The preprocessed data should have the same number of rows - self.assertEqual(len(X_preprocessed) if hasattr(X_preprocessed, '__len__') else X_preprocessed.shape[0], - len(X_test)) + preprocessed_len = len(X_preprocessed) if hasattr(X_preprocessed, '__len__') else X_preprocessed.shape[0] + self.assertEqual(preprocessed_len, len(X_test)) def test_estimator_preprocess(self): """Test estimator-level preprocessing.""" From 254ef3927f83ad2c7f03242e66d76cefd10ec3fd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 20 Jan 2026 15:41:41 +0000 Subject: [PATCH 6/7] Fix formatting issues with pre-commit hooks Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> --- flaml/automl/automl.py | 4 +- flaml/automl/model.py | 4 +- notebook/preprocess_api_example.py | 5 +- test/automl/test_preprocess_api.py | 112 ++++++++++-------- .../docs/Use-Cases/Task-Oriented-AutoML.md | 2 +- 5 files changed, 68 insertions(+), 59 deletions(-) diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py index e96063f1fa..6dda360d8a 100644 --- a/flaml/automl/automl.py +++ b/flaml/automl/automl.py @@ -852,7 +852,7 @@ def preprocess( ```python automl = AutoML() automl.fit(X_train, y_train, task="classification") - + # Apply task-level preprocessing to new data X_test_preprocessed = automl.preprocess(X_test) ``` @@ -861,7 +861,7 @@ def preprocess( raise AttributeError("AutoML instance has not been fitted yet. Please call fit() first.") if not hasattr(self, "_transformer"): raise AttributeError("Transformer not initialized. Please call fit() first.") - + return self._state.task.preprocess(X, self._transformer) def add_learner(self, learner_name, learner_class): diff --git a/flaml/automl/model.py b/flaml/automl/model.py index 040c383608..be99ad8b34 100644 --- a/flaml/automl/model.py +++ b/flaml/automl/model.py @@ -313,10 +313,10 @@ def preprocess(self, X): ```python automl = AutoML() automl.fit(X_train, y_train, task="classification") - + # First apply task-level preprocessing X_test_task = automl.preprocess(X_test) - + # Then apply estimator-level preprocessing estimator = automl.model X_test_estimator = estimator.preprocess(X_test_task) diff --git a/notebook/preprocess_api_example.py b/notebook/preprocess_api_example.py index 3dcb232396..fdd4613788 100644 --- a/notebook/preprocess_api_example.py +++ b/notebook/preprocess_api_example.py @@ -5,10 +5,11 @@ APIs exposed by FLAML AutoML. """ -from flaml import AutoML +import numpy as np from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split -import numpy as np + +from flaml import AutoML # Load and split data print("Loading breast cancer dataset...") diff --git a/test/automl/test_preprocess_api.py b/test/automl/test_preprocess_api.py index 412b7e2f98..2a68c9e47c 100644 --- a/test/automl/test_preprocess_api.py +++ b/test/automl/test_preprocess_api.py @@ -15,7 +15,7 @@ def test_automl_preprocess_before_fit(self): """Test that calling preprocess before fit raises an error.""" automl = AutoML() X_test = np.array([[1, 2, 3], [4, 5, 6]]) - + with self.assertRaises(AttributeError) as context: automl.preprocess(X_test) # Check that an error is raised about not being fitted @@ -27,7 +27,7 @@ def test_automl_preprocess_classification(self): X, y = load_breast_cancer(return_X_y=True) X_train, y_train = X[:400], y[:400] X_test = X[400:450] - + # Train AutoML automl = AutoML() automl_settings = { @@ -38,10 +38,10 @@ def test_automl_preprocess_classification(self): "verbose": 0, } automl.fit(X_train, y_train, **automl_settings) - + # Test task-level preprocessing X_preprocessed = automl.preprocess(X_test) - + # Verify the output is not None and has the right shape self.assertIsNotNone(X_preprocessed) self.assertEqual(X_preprocessed.shape[0], X_test.shape[0]) @@ -52,7 +52,7 @@ def test_automl_preprocess_regression(self): X, y = load_diabetes(return_X_y=True) X_train, y_train = X[:300], y[:300] X_test = X[300:350] - + # Train AutoML automl = AutoML() automl_settings = { @@ -63,10 +63,10 @@ def test_automl_preprocess_regression(self): "verbose": 0, } automl.fit(X_train, y_train, **automl_settings) - + # Test task-level preprocessing X_preprocessed = automl.preprocess(X_test) - + # Verify the output self.assertIsNotNone(X_preprocessed) self.assertEqual(X_preprocessed.shape[0], X_test.shape[0]) @@ -74,19 +74,23 @@ def test_automl_preprocess_regression(self): def test_automl_preprocess_with_dataframe(self): """Test task-level preprocessing with pandas DataFrame.""" # Create a simple dataset - X_train = pd.DataFrame({ - "feature1": [1, 2, 3, 4, 5] * 20, - "feature2": [5, 4, 3, 2, 1] * 20, - "category": ["a", "b", "a", "b", "a"] * 20, - }) + X_train = pd.DataFrame( + { + "feature1": [1, 2, 3, 4, 5] * 20, + "feature2": [5, 4, 3, 2, 1] * 20, + "category": ["a", "b", "a", "b", "a"] * 20, + } + ) y_train = pd.Series([0, 1, 0, 1, 0] * 20) - - X_test = pd.DataFrame({ - "feature1": [6, 7, 8], - "feature2": [1, 2, 3], - "category": ["a", "b", "a"], - }) - + + X_test = pd.DataFrame( + { + "feature1": [6, 7, 8], + "feature2": [1, 2, 3], + "category": ["a", "b", "a"], + } + ) + # Train AutoML automl = AutoML() automl_settings = { @@ -97,13 +101,13 @@ def test_automl_preprocess_with_dataframe(self): "verbose": 0, } automl.fit(X_train, y_train, **automl_settings) - + # Test preprocessing X_preprocessed = automl.preprocess(X_test) - + # Verify the output - check the number of rows matches self.assertIsNotNone(X_preprocessed) - preprocessed_len = len(X_preprocessed) if hasattr(X_preprocessed, '__len__') else X_preprocessed.shape[0] + preprocessed_len = len(X_preprocessed) if hasattr(X_preprocessed, "__len__") else X_preprocessed.shape[0] self.assertEqual(preprocessed_len, len(X_test)) def test_estimator_preprocess(self): @@ -112,7 +116,7 @@ def test_estimator_preprocess(self): X, y = load_breast_cancer(return_X_y=True) X_train, y_train = X[:400], y[:400] X_test = X[400:450] - + # Train AutoML automl = AutoML() automl_settings = { @@ -123,17 +127,17 @@ def test_estimator_preprocess(self): "verbose": 0, } automl.fit(X_train, y_train, **automl_settings) - + # Get the trained estimator estimator = automl.model self.assertIsNotNone(estimator) - + # First apply task-level preprocessing X_task_preprocessed = automl.preprocess(X_test) - + # Then apply estimator-level preprocessing X_estimator_preprocessed = estimator.preprocess(X_task_preprocessed) - + # Verify the output self.assertIsNotNone(X_estimator_preprocessed) self.assertEqual(X_estimator_preprocessed.shape[0], X_test.shape[0]) @@ -144,7 +148,7 @@ def test_preprocess_pipeline(self): X, y = load_breast_cancer(return_X_y=True) X_train, y_train = X[:400], y[:400] X_test = X[400:450] - + # Train AutoML automl = AutoML() automl_settings = { @@ -155,38 +159,42 @@ def test_preprocess_pipeline(self): "verbose": 0, } automl.fit(X_train, y_train, **automl_settings) - + # Apply the complete preprocessing pipeline X_task_preprocessed = automl.preprocess(X_test) X_final = automl.model.preprocess(X_task_preprocessed) - + # Verify predictions work with preprocessed data - # The internal predict already does this preprocessing, + # The internal predict already does this preprocessing, # but we verify our manual preprocessing gives consistent results y_pred_manual = automl.model._model.predict(X_final) y_pred_auto = automl.predict(X_test) - + # Both should give the same predictions np.testing.assert_array_equal(y_pred_manual, y_pred_auto) def test_preprocess_with_mixed_types(self): """Test preprocessing with mixed data types.""" # Create dataset with mixed types - X_train = pd.DataFrame({ - "numeric1": np.random.rand(100), - "numeric2": np.random.randint(0, 100, 100), - "categorical": np.random.choice(["cat", "dog", "bird"], 100), - "boolean": np.random.choice([True, False], 100), - }) + X_train = pd.DataFrame( + { + "numeric1": np.random.rand(100), + "numeric2": np.random.randint(0, 100, 100), + "categorical": np.random.choice(["cat", "dog", "bird"], 100), + "boolean": np.random.choice([True, False], 100), + } + ) y_train = pd.Series(np.random.randint(0, 2, 100)) - - X_test = pd.DataFrame({ - "numeric1": np.random.rand(10), - "numeric2": np.random.randint(0, 100, 10), - "categorical": np.random.choice(["cat", "dog", "bird"], 10), - "boolean": np.random.choice([True, False], 10), - }) - + + X_test = pd.DataFrame( + { + "numeric1": np.random.rand(10), + "numeric2": np.random.randint(0, 100, 10), + "categorical": np.random.choice(["cat", "dog", "bird"], 10), + "boolean": np.random.choice([True, False], 10), + } + ) + # Train AutoML automl = AutoML() automl_settings = { @@ -197,28 +205,28 @@ def test_preprocess_with_mixed_types(self): "verbose": 0, } automl.fit(X_train, y_train, **automl_settings) - + # Test preprocessing X_preprocessed = automl.preprocess(X_test) - + # Verify the output self.assertIsNotNone(X_preprocessed) def test_estimator_preprocess_without_automl(self): """Test that estimator.preprocess() can be used independently.""" from flaml.automl.model import LGBMEstimator - + # Create a simple estimator X_train = np.random.rand(100, 5) y_train = np.random.randint(0, 2, 100) - + estimator = LGBMEstimator(task="classification") estimator.fit(X_train, y_train) - + # Test preprocessing X_test = np.random.rand(10, 5) X_preprocessed = estimator.preprocess(X_test) - + # Verify the output self.assertIsNotNone(X_preprocessed) self.assertEqual(X_preprocessed.shape, X_test.shape) diff --git a/website/docs/Use-Cases/Task-Oriented-AutoML.md b/website/docs/Use-Cases/Task-Oriented-AutoML.md index 30ef131121..602602c77c 100644 --- a/website/docs/Use-Cases/Task-Oriented-AutoML.md +++ b/website/docs/Use-Cases/Task-Oriented-AutoML.md @@ -732,7 +732,7 @@ FLAML provides two levels of preprocessing that can be accessed as public APIs: 1. **Task-level preprocessing** (`automl.preprocess()`): This applies transformations that are specific to the task type, such as handling data types, sparse matrices, and feature transformations learned during training. -2. **Estimator-level preprocessing** (`estimator.preprocess()`): This applies transformations specific to the estimator type (e.g., LightGBM, XGBoost). +1. **Estimator-level preprocessing** (`estimator.preprocess()`): This applies transformations specific to the estimator type (e.g., LightGBM, XGBoost). The task-level preprocessing should be applied before the estimator-level preprocessing. From a70bfbacd2589759d4b06aa822e6911e7f9d97ee Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Wed, 21 Jan 2026 14:33:42 +0800 Subject: [PATCH 7/7] Remove example.py, make tests faster --- notebook/preprocess_api_example.py | 97 ------------------------------ test/automl/test_preprocess_api.py | 12 ++-- 2 files changed, 6 insertions(+), 103 deletions(-) delete mode 100644 notebook/preprocess_api_example.py diff --git a/notebook/preprocess_api_example.py b/notebook/preprocess_api_example.py deleted file mode 100644 index fdd4613788..0000000000 --- a/notebook/preprocess_api_example.py +++ /dev/null @@ -1,97 +0,0 @@ -""" -Example demonstrating the use of FLAML's preprocess() API. - -This script shows how to use both task-level and estimator-level preprocessing -APIs exposed by FLAML AutoML. -""" - -import numpy as np -from sklearn.datasets import load_breast_cancer -from sklearn.model_selection import train_test_split - -from flaml import AutoML - -# Load and split data -print("Loading breast cancer dataset...") -X, y = load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - -print(f"Training data shape: {X_train.shape}") -print(f"Test data shape: {X_test.shape}") - -# Train AutoML model -print("\nTraining AutoML model...") -automl = AutoML() -automl_settings = { - "time_budget": 10, # 10 seconds - "task": "classification", - "metric": "accuracy", - "estimator_list": ["lgbm", "xgboost"], - "verbose": 0, -} -automl.fit(X_train, y_train, **automl_settings) - -print(f"Best estimator: {automl.best_estimator}") -print(f"Best accuracy: {1 - automl.best_loss:.4f}") - -# Example 1: Using task-level preprocessing -print("\n" + "=" * 60) -print("Example 1: Task-level preprocessing") -print("=" * 60) -X_test_task = automl.preprocess(X_test) -print(f"Original test data shape: {X_test.shape}") -print(f"After task preprocessing: {X_test_task.shape}") - -# Example 2: Using estimator-level preprocessing -print("\n" + "=" * 60) -print("Example 2: Estimator-level preprocessing") -print("=" * 60) -estimator = automl.model -X_test_estimator = estimator.preprocess(X_test_task) -print(f"After estimator preprocessing: {X_test_estimator.shape}") - -# Example 3: Complete preprocessing pipeline -print("\n" + "=" * 60) -print("Example 3: Complete preprocessing pipeline") -print("=" * 60) -# Apply both levels of preprocessing -X_preprocessed = automl.preprocess(X_test) -X_final = automl.model.preprocess(X_preprocessed) - -# Manual prediction using fully preprocessed data -y_pred_manual = automl.model._model.predict(X_final) - -# Compare with AutoML's predict method (which does preprocessing internally) -y_pred_auto = automl.predict(X_test) - -print(f"Predictions match: {np.array_equal(y_pred_manual, y_pred_auto)}") -print(f"Manual prediction sample: {y_pred_manual[:5]}") -print(f"Auto prediction sample: {y_pred_auto[:5]}") - -# Example 4: Using preprocessing for custom inference -print("\n" + "=" * 60) -print("Example 4: Custom inference with preprocessing") -print("=" * 60) -# You might want to apply preprocessing separately for: -# - Debugging -# - Custom inference pipelines -# - Integration with other tools - -# Get preprocessed features -X_features = automl.preprocess(X_test) -X_features = automl.model.preprocess(X_features) - -# Now you can use these features with the underlying model or for analysis -print(f"Preprocessed features ready for custom use: {X_features.shape}") -print(f"Feature statistics - Mean: {np.mean(X_features):.4f}, Std: {np.std(X_features):.4f}") - -print("\n" + "=" * 60) -print("Summary") -print("=" * 60) -print("The preprocess() API allows you to:") -print("1. Apply task-level preprocessing with automl.preprocess()") -print("2. Apply estimator-level preprocessing with estimator.preprocess()") -print("3. Chain both for complete preprocessing pipeline") -print("4. Use preprocessed data for custom inference or analysis") -print("\nNote: Task-level preprocessing should always be applied before") -print(" estimator-level preprocessing.") diff --git a/test/automl/test_preprocess_api.py b/test/automl/test_preprocess_api.py index 2a68c9e47c..45b9c6143b 100644 --- a/test/automl/test_preprocess_api.py +++ b/test/automl/test_preprocess_api.py @@ -31,7 +31,7 @@ def test_automl_preprocess_classification(self): # Train AutoML automl = AutoML() automl_settings = { - "time_budget": 5, + "max_iter": 5, "task": "classification", "metric": "accuracy", "estimator_list": ["lgbm"], @@ -56,7 +56,7 @@ def test_automl_preprocess_regression(self): # Train AutoML automl = AutoML() automl_settings = { - "time_budget": 5, + "max_iter": 5, "task": "regression", "metric": "r2", "estimator_list": ["lgbm"], @@ -94,7 +94,7 @@ def test_automl_preprocess_with_dataframe(self): # Train AutoML automl = AutoML() automl_settings = { - "time_budget": 5, + "max_iter": 5, "task": "classification", "metric": "accuracy", "estimator_list": ["lgbm"], @@ -120,7 +120,7 @@ def test_estimator_preprocess(self): # Train AutoML automl = AutoML() automl_settings = { - "time_budget": 5, + "max_iter": 5, "task": "classification", "metric": "accuracy", "estimator_list": ["lgbm"], @@ -152,7 +152,7 @@ def test_preprocess_pipeline(self): # Train AutoML automl = AutoML() automl_settings = { - "time_budget": 5, + "max_iter": 5, "task": "classification", "metric": "accuracy", "estimator_list": ["lgbm"], @@ -198,7 +198,7 @@ def test_preprocess_with_mixed_types(self): # Train AutoML automl = AutoML() automl_settings = { - "time_budget": 5, + "max_iter": 5, "task": "classification", "metric": "accuracy", "estimator_list": ["lgbm"],