From 6a8a89b08fc851f5fa1c87ff7d2743f29c75f043 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 20 Jan 2026 04:08:35 +0000
Subject: [PATCH 1/7] Initial plan


From ca407482f3190f072df524490c32e134723c3cf2 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 20 Jan 2026 04:19:53 +0000
Subject: [PATCH 2/7] Add public preprocess() API methods for AutoML and
 estimators

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
---
 flaml/automl/automl.py             |  44 ++++++
 flaml/automl/model.py              |  29 ++++
 test/automl/test_preprocess_api.py | 229 +++++++++++++++++++++++++++++
 3 files changed, 302 insertions(+)
 create mode 100644 test/automl/test_preprocess_api.py

diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py
index 3a4ce2c37c..97a2b68476 100644
--- a/flaml/automl/automl.py
+++ b/flaml/automl/automl.py
@@ -815,6 +815,50 @@ def predict_proba(self, X, **pred_kwargs):
         proba = self._trained_estimator.predict_proba(X, **pred_kwargs)
         return proba
 
+    def preprocess(
+        self,
+        X: np.array | DataFrame | list[str] | list[list[str]] | psDataFrame,
+    ):
+        """Preprocess data using task-level preprocessing.
+
+        This method applies task-level preprocessing transformations to the input data,
+        including handling of data types, sparse matrices, and feature transformations
+        that were learned during the fit phase. This should be called before any
+        estimator-level preprocessing.
+
+        Args:
+            X: A numpy array or pandas dataframe or pyspark.pandas dataframe
+                of featurized instances, shape n * m,
+                or for time series forecast tasks:
+                    a pandas dataframe with the first column containing
+                    timestamp values (datetime type) or an integer n for
+                    the predict steps (only valid when the estimator is
+                    arima or sarimax). Other columns in the dataframe
+                    are assumed to be exogenous variables (categorical
+                    or numeric).
+
+        Returns:
+            Preprocessed data in the same format as input (numpy array, DataFrame, etc.).
+
+        Raises:
+            AttributeError: If the model has not been fitted yet.
+
+        Example:
+            ```python
+            automl = AutoML()
+            automl.fit(X_train, y_train, task="classification")
+            
+            # Apply task-level preprocessing to new data
+            X_test_preprocessed = automl.preprocess(X_test)
+            ```
+        """
+        if not hasattr(self, "_state") or self._state is None:
+            raise AttributeError("AutoML instance has not been fitted yet. Please call fit() first.")
+        if not hasattr(self, "_transformer"):
+            raise AttributeError("Transformer not initialized. Please call fit() first.")
+        
+        return self._state.task.preprocess(X, self._transformer)
+
     def add_learner(self, learner_name, learner_class):
         """Add a customized learner.
 
diff --git a/flaml/automl/model.py b/flaml/automl/model.py
index 0c6c47cec8..040c383608 100644
--- a/flaml/automl/model.py
+++ b/flaml/automl/model.py
@@ -295,6 +295,35 @@ def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs):
             train_time = self._fit(X_train, y_train, **kwargs)
         return train_time
 
+    def preprocess(self, X):
+        """Preprocess data using estimator-level preprocessing.
+
+        This method applies estimator-specific preprocessing transformations to the input data.
+        This is the second level of preprocessing that should be applied after task-level
+        preprocessing (automl.preprocess()). Different estimator types may apply different
+        preprocessing steps (e.g., sparse matrix conversion, dataframe handling).
+
+        Args:
+            X: A numpy array or a dataframe of featurized instances, shape n*m.
+
+        Returns:
+            Preprocessed data ready for the estimator's predict/fit methods.
+
+        Example:
+            ```python
+            automl = AutoML()
+            automl.fit(X_train, y_train, task="classification")
+            
+            # First apply task-level preprocessing
+            X_test_task = automl.preprocess(X_test)
+            
+            # Then apply estimator-level preprocessing
+            estimator = automl.model
+            X_test_estimator = estimator.preprocess(X_test_task)
+            ```
+        """
+        return self._preprocess(X)
+
     def predict(self, X, **kwargs):
         """Predict label from features.
 
diff --git a/test/automl/test_preprocess_api.py b/test/automl/test_preprocess_api.py
new file mode 100644
index 0000000000..1d9ef75d30
--- /dev/null
+++ b/test/automl/test_preprocess_api.py
@@ -0,0 +1,229 @@
+"""Tests for the public preprocessor APIs."""
+import unittest
+
+import numpy as np
+import pandas as pd
+from sklearn.datasets import load_breast_cancer, load_diabetes
+
+from flaml import AutoML
+
+
+class TestPreprocessAPI(unittest.TestCase):
+    """Test cases for the public preprocess() API methods."""
+
+    def test_automl_preprocess_before_fit(self):
+        """Test that calling preprocess before fit raises an error."""
+        automl = AutoML()
+        X_test = np.array([[1, 2, 3], [4, 5, 6]])
+        
+        with self.assertRaises(AttributeError) as context:
+            automl.preprocess(X_test)
+        # Check that an error is raised about not being fitted
+        self.assertIn("fit()", str(context.exception))
+
+    def test_automl_preprocess_classification(self):
+        """Test task-level preprocessing for classification."""
+        # Load dataset
+        X, y = load_breast_cancer(return_X_y=True)
+        X_train, y_train = X[:400], y[:400]
+        X_test = X[400:450]
+        
+        # Train AutoML
+        automl = AutoML()
+        automl_settings = {
+            "time_budget": 5,
+            "task": "classification",
+            "metric": "accuracy",
+            "estimator_list": ["lgbm"],
+            "verbose": 0,
+        }
+        automl.fit(X_train, y_train, **automl_settings)
+        
+        # Test task-level preprocessing
+        X_preprocessed = automl.preprocess(X_test)
+        
+        # Verify the output is not None and has the right shape
+        self.assertIsNotNone(X_preprocessed)
+        self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])
+
+    def test_automl_preprocess_regression(self):
+        """Test task-level preprocessing for regression."""
+        # Load dataset
+        X, y = load_diabetes(return_X_y=True)
+        X_train, y_train = X[:300], y[:300]
+        X_test = X[300:350]
+        
+        # Train AutoML
+        automl = AutoML()
+        automl_settings = {
+            "time_budget": 5,
+            "task": "regression",
+            "metric": "r2",
+            "estimator_list": ["lgbm"],
+            "verbose": 0,
+        }
+        automl.fit(X_train, y_train, **automl_settings)
+        
+        # Test task-level preprocessing
+        X_preprocessed = automl.preprocess(X_test)
+        
+        # Verify the output
+        self.assertIsNotNone(X_preprocessed)
+        self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])
+
+    def test_automl_preprocess_with_dataframe(self):
+        """Test task-level preprocessing with pandas DataFrame."""
+        # Create a simple dataset
+        X_train = pd.DataFrame({
+            "feature1": [1, 2, 3, 4, 5] * 20,
+            "feature2": [5, 4, 3, 2, 1] * 20,
+            "category": ["a", "b", "a", "b", "a"] * 20,
+        })
+        y_train = pd.Series([0, 1, 0, 1, 0] * 20)
+        
+        X_test = pd.DataFrame({
+            "feature1": [6, 7, 8],
+            "feature2": [1, 2, 3],
+            "category": ["a", "b", "a"],
+        })
+        
+        # Train AutoML
+        automl = AutoML()
+        automl_settings = {
+            "time_budget": 5,
+            "task": "classification",
+            "metric": "accuracy",
+            "estimator_list": ["lgbm"],
+            "verbose": 0,
+        }
+        automl.fit(X_train, y_train, **automl_settings)
+        
+        # Test preprocessing
+        X_preprocessed = automl.preprocess(X_test)
+        
+        # Verify the output
+        self.assertIsNotNone(X_preprocessed)
+        # The preprocessed data should have the same number of rows
+        self.assertEqual(len(X_preprocessed) if hasattr(X_preprocessed, '__len__') else X_preprocessed.shape[0], 
+                         len(X_test))
+
+    def test_estimator_preprocess(self):
+        """Test estimator-level preprocessing."""
+        # Load dataset
+        X, y = load_breast_cancer(return_X_y=True)
+        X_train, y_train = X[:400], y[:400]
+        X_test = X[400:450]
+        
+        # Train AutoML
+        automl = AutoML()
+        automl_settings = {
+            "time_budget": 5,
+            "task": "classification",
+            "metric": "accuracy",
+            "estimator_list": ["lgbm"],
+            "verbose": 0,
+        }
+        automl.fit(X_train, y_train, **automl_settings)
+        
+        # Get the trained estimator
+        estimator = automl.model
+        self.assertIsNotNone(estimator)
+        
+        # First apply task-level preprocessing
+        X_task_preprocessed = automl.preprocess(X_test)
+        
+        # Then apply estimator-level preprocessing
+        X_estimator_preprocessed = estimator.preprocess(X_task_preprocessed)
+        
+        # Verify the output
+        self.assertIsNotNone(X_estimator_preprocessed)
+        self.assertEqual(X_estimator_preprocessed.shape[0], X_test.shape[0])
+
+    def test_preprocess_pipeline(self):
+        """Test the complete preprocessing pipeline (task-level then estimator-level)."""
+        # Load dataset
+        X, y = load_breast_cancer(return_X_y=True)
+        X_train, y_train = X[:400], y[:400]
+        X_test = X[400:450]
+        
+        # Train AutoML
+        automl = AutoML()
+        automl_settings = {
+            "time_budget": 5,
+            "task": "classification",
+            "metric": "accuracy",
+            "estimator_list": ["lgbm"],
+            "verbose": 0,
+        }
+        automl.fit(X_train, y_train, **automl_settings)
+        
+        # Apply the complete preprocessing pipeline
+        X_task_preprocessed = automl.preprocess(X_test)
+        X_final = automl.model.preprocess(X_task_preprocessed)
+        
+        # Verify predictions work with preprocessed data
+        # The internal predict already does this preprocessing, 
+        # but we verify our manual preprocessing gives consistent results
+        y_pred_manual = automl.model._model.predict(X_final)
+        y_pred_auto = automl.predict(X_test)
+        
+        # Both should give the same predictions
+        np.testing.assert_array_equal(y_pred_manual, y_pred_auto)
+
+    def test_preprocess_with_mixed_types(self):
+        """Test preprocessing with mixed data types."""
+        # Create dataset with mixed types
+        X_train = pd.DataFrame({
+            "numeric1": np.random.rand(100),
+            "numeric2": np.random.randint(0, 100, 100),
+            "categorical": np.random.choice(["cat", "dog", "bird"], 100),
+            "boolean": np.random.choice([True, False], 100),
+        })
+        y_train = pd.Series(np.random.randint(0, 2, 100))
+        
+        X_test = pd.DataFrame({
+            "numeric1": np.random.rand(10),
+            "numeric2": np.random.randint(0, 100, 10),
+            "categorical": np.random.choice(["cat", "dog", "bird"], 10),
+            "boolean": np.random.choice([True, False], 10),
+        })
+        
+        # Train AutoML
+        automl = AutoML()
+        automl_settings = {
+            "time_budget": 5,
+            "task": "classification",
+            "metric": "accuracy",
+            "estimator_list": ["lgbm"],
+            "verbose": 0,
+        }
+        automl.fit(X_train, y_train, **automl_settings)
+        
+        # Test preprocessing
+        X_preprocessed = automl.preprocess(X_test)
+        
+        # Verify the output
+        self.assertIsNotNone(X_preprocessed)
+
+    def test_estimator_preprocess_without_automl(self):
+        """Test that estimator.preprocess() can be used independently."""
+        from flaml.automl.model import LGBMEstimator
+        
+        # Create a simple estimator
+        X_train = np.random.rand(100, 5)
+        y_train = np.random.randint(0, 2, 100)
+        
+        estimator = LGBMEstimator(task="classification")
+        estimator.fit(X_train, y_train)
+        
+        # Test preprocessing
+        X_test = np.random.rand(10, 5)
+        X_preprocessed = estimator.preprocess(X_test)
+        
+        # Verify the output
+        self.assertIsNotNone(X_preprocessed)
+        self.assertEqual(X_preprocessed.shape, X_test.shape)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 382350d82cc6fe2e19ddb20bdf75e2a5d5a607ba Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 20 Jan 2026 04:22:19 +0000
Subject: [PATCH 3/7] Add documentation for preprocess() API methods

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
---
 .../docs/Use-Cases/Task-Oriented-AutoML.md    | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/website/docs/Use-Cases/Task-Oriented-AutoML.md b/website/docs/Use-Cases/Task-Oriented-AutoML.md
index f43e6d7466..96afe1623a 100644
--- a/website/docs/Use-Cases/Task-Oriented-AutoML.md
+++ b/website/docs/Use-Cases/Task-Oriented-AutoML.md
@@ -704,6 +704,64 @@ plt.barh(
 
 ![png](images/feature_importance.png)
 
+### Preprocess data
+
+FLAML provides two levels of preprocessing that can be accessed as public APIs:
+
+1. **Task-level preprocessing** (`automl.preprocess()`): This applies transformations that are specific to the task type, such as handling data types, sparse matrices, and feature transformations learned during training.
+
+2. **Estimator-level preprocessing** (`estimator.preprocess()`): This applies transformations specific to the estimator type (e.g., LightGBM, XGBoost).
+
+The task-level preprocessing should be applied before the estimator-level preprocessing.
+
+#### Task-level preprocessing
+
+```python
+from flaml import AutoML
+import numpy as np
+
+# Train the model
+automl = AutoML()
+automl.fit(X_train, y_train, task="classification", time_budget=60)
+
+# Apply task-level preprocessing to new data
+X_test_preprocessed = automl.preprocess(X_test)
+
+# Pass the task-level preprocessed data to the trained estimator for prediction
+predictions = automl.model.predict(X_test_preprocessed)
+```
+
+#### Estimator-level preprocessing
+
+```python
+# Get the trained estimator
+estimator = automl.model
+
+# Apply task-level preprocessing first
+X_test_task = automl.preprocess(X_test)
+
+# Then apply estimator-level preprocessing
+X_test_estimator = estimator.preprocess(X_test_task)
+
+# Use the fully preprocessed data with the underlying model
+predictions = estimator._model.predict(X_test_estimator)
+```
+
+#### Complete preprocessing pipeline
+
+For most use cases, the `predict()` method already handles both levels of preprocessing internally. However, if you need to apply preprocessing separately (e.g., for custom inference pipelines or debugging), you can chain the two `preprocess()` calls yourself:
+
+```python
+# Complete preprocessing pipeline
+X_task_preprocessed = automl.preprocess(X_test)
+X_final = automl.model.preprocess(X_task_preprocessed)
+
+# These are the same two preprocessing steps that run internally, before prediction, in:
+predictions = automl.predict(X_test)
+```
+
+**Note**: The `preprocess()` methods can only be called after `fit()` has been executed, as they rely on the transformations learned during training.
+
 ### Get best configuration
 
 We can find the best estimator's name and best configuration by:

From bdf6c53c5d2c45bc8c6e427bd749affdd917787f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 20 Jan 2026 04:23:12 +0000
Subject: [PATCH 4/7] Add example script demonstrating preprocess() API usage

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
---
 notebook/preprocess_api_example.py | 96 ++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 notebook/preprocess_api_example.py

diff --git a/notebook/preprocess_api_example.py b/notebook/preprocess_api_example.py
new file mode 100644
index 0000000000..3dcb232396
--- /dev/null
+++ b/notebook/preprocess_api_example.py
@@ -0,0 +1,96 @@
+"""
+Example demonstrating the use of FLAML's preprocess() API.
+
+This script shows how to use both task-level and estimator-level preprocessing
+APIs exposed by FLAML AutoML.
+"""
+
+from flaml import AutoML
+from sklearn.datasets import load_breast_cancer
+from sklearn.model_selection import train_test_split
+import numpy as np
+
+# Load and split data
+print("Loading breast cancer dataset...")
+X, y = load_breast_cancer(return_X_y=True)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+print(f"Training data shape: {X_train.shape}")
+print(f"Test data shape: {X_test.shape}")
+
+# Train AutoML model
+print("\nTraining AutoML model...")
+automl = AutoML()
+automl_settings = {
+    "time_budget": 10,  # 10 seconds
+    "task": "classification",
+    "metric": "accuracy",
+    "estimator_list": ["lgbm", "xgboost"],
+    "verbose": 0,
+}
+automl.fit(X_train, y_train, **automl_settings)
+
+print(f"Best estimator: {automl.best_estimator}")
+print(f"Best accuracy: {1 - automl.best_loss:.4f}")
+
+# Example 1: Using task-level preprocessing
+print("\n" + "=" * 60)
+print("Example 1: Task-level preprocessing")
+print("=" * 60)
+X_test_task = automl.preprocess(X_test)
+print(f"Original test data shape: {X_test.shape}")
+print(f"After task preprocessing: {X_test_task.shape}")
+
+# Example 2: Using estimator-level preprocessing
+print("\n" + "=" * 60)
+print("Example 2: Estimator-level preprocessing")
+print("=" * 60)
+estimator = automl.model
+X_test_estimator = estimator.preprocess(X_test_task)
+print(f"After estimator preprocessing: {X_test_estimator.shape}")
+
+# Example 3: Complete preprocessing pipeline
+print("\n" + "=" * 60)
+print("Example 3: Complete preprocessing pipeline")
+print("=" * 60)
+# Apply both levels of preprocessing
+X_preprocessed = automl.preprocess(X_test)
+X_final = automl.model.preprocess(X_preprocessed)
+
+# Manual prediction using fully preprocessed data
+y_pred_manual = automl.model._model.predict(X_final)
+
+# Compare with AutoML's predict method (which does preprocessing internally)
+y_pred_auto = automl.predict(X_test)
+
+print(f"Predictions match: {np.array_equal(y_pred_manual, y_pred_auto)}")
+print(f"Manual prediction sample: {y_pred_manual[:5]}")
+print(f"Auto prediction sample: {y_pred_auto[:5]}")
+
+# Example 4: Using preprocessing for custom inference
+print("\n" + "=" * 60)
+print("Example 4: Custom inference with preprocessing")
+print("=" * 60)
+# You might want to apply preprocessing separately for:
+# - Debugging
+# - Custom inference pipelines
+# - Integration with other tools
+
+# Get preprocessed features
+X_features = automl.preprocess(X_test)
+X_features = automl.model.preprocess(X_features)
+
+# Now you can use these features with the underlying model or for analysis
+print(f"Preprocessed features ready for custom use: {X_features.shape}")
+print(f"Feature statistics - Mean: {np.mean(X_features):.4f}, Std: {np.std(X_features):.4f}")
+
+print("\n" + "=" * 60)
+print("Summary")
+print("=" * 60)
+print("The preprocess() API allows you to:")
+print("1. Apply task-level preprocessing with automl.preprocess()")
+print("2. Apply estimator-level preprocessing with estimator.preprocess()")
+print("3. Chain both for complete preprocessing pipeline")
+print("4. Use preprocessed data for custom inference or analysis")
+print("\nNote: Task-level preprocessing should always be applied before")
+print("      estimator-level preprocessing.")

From 61ea727572a3b2a32510f271c33ccd5f4a69a3b9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 20 Jan 2026 04:25:40 +0000
Subject: [PATCH 5/7] Address code review feedback - fix type hints and
 simplify test logic

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
---
 flaml/automl/automl.py             | 4 ++--
 test/automl/test_preprocess_api.py | 7 +++----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py
index 97a2b68476..990fdf7e99 100644
--- a/flaml/automl/automl.py
+++ b/flaml/automl/automl.py
@@ -749,7 +749,7 @@ def score(
 
     def predict(
         self,
-        X: np.array | DataFrame | list[str] | list[list[str]] | psDataFrame,
+        X: np.ndarray | DataFrame | list[str] | list[list[str]] | psDataFrame,
         **pred_kwargs,
     ):
         """Predict label from features.
@@ -817,7 +817,7 @@ def predict_proba(self, X, **pred_kwargs):
 
     def preprocess(
         self,
-        X: np.array | DataFrame | list[str] | list[list[str]] | psDataFrame,
+        X: np.ndarray | DataFrame | list[str] | list[list[str]] | psDataFrame,
     ):
         """Preprocess data using task-level preprocessing.
 
diff --git a/test/automl/test_preprocess_api.py b/test/automl/test_preprocess_api.py
index 1d9ef75d30..412b7e2f98 100644
--- a/test/automl/test_preprocess_api.py
+++ b/test/automl/test_preprocess_api.py
@@ -101,11 +101,10 @@ def test_automl_preprocess_with_dataframe(self):
         # Test preprocessing
         X_preprocessed = automl.preprocess(X_test)
         
-        # Verify the output
+        # Verify the output - check the number of rows matches
         self.assertIsNotNone(X_preprocessed)
-        # The preprocessed data should have the same number of rows
-        self.assertEqual(len(X_preprocessed) if hasattr(X_preprocessed, '__len__') else X_preprocessed.shape[0], 
-                         len(X_test))
+        preprocessed_len = len(X_preprocessed) if hasattr(X_preprocessed, '__len__') else X_preprocessed.shape[0]
+        self.assertEqual(preprocessed_len, len(X_test))
 
     def test_estimator_preprocess(self):
         """Test estimator-level preprocessing."""

From 254ef3927f83ad2c7f03242e66d76cefd10ec3fd Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 20 Jan 2026 15:41:41 +0000
Subject: [PATCH 6/7] Fix formatting issues with pre-commit hooks

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
---
 flaml/automl/automl.py                        |   4 +-
 flaml/automl/model.py                         |   4 +-
 notebook/preprocess_api_example.py            |   5 +-
 test/automl/test_preprocess_api.py            | 112 ++++++++++--------
 .../docs/Use-Cases/Task-Oriented-AutoML.md    |   2 +-
 5 files changed, 68 insertions(+), 59 deletions(-)

diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py
index e96063f1fa..6dda360d8a 100644
--- a/flaml/automl/automl.py
+++ b/flaml/automl/automl.py
@@ -852,7 +852,7 @@ def preprocess(
             ```python
             automl = AutoML()
             automl.fit(X_train, y_train, task="classification")
-            
+
             # Apply task-level preprocessing to new data
             X_test_preprocessed = automl.preprocess(X_test)
             ```
@@ -861,7 +861,7 @@ def preprocess(
             raise AttributeError("AutoML instance has not been fitted yet. Please call fit() first.")
         if not hasattr(self, "_transformer"):
             raise AttributeError("Transformer not initialized. Please call fit() first.")
-        
+
         return self._state.task.preprocess(X, self._transformer)
 
     def add_learner(self, learner_name, learner_class):
diff --git a/flaml/automl/model.py b/flaml/automl/model.py
index 040c383608..be99ad8b34 100644
--- a/flaml/automl/model.py
+++ b/flaml/automl/model.py
@@ -313,10 +313,10 @@ def preprocess(self, X):
             ```python
             automl = AutoML()
             automl.fit(X_train, y_train, task="classification")
-            
+
             # First apply task-level preprocessing
             X_test_task = automl.preprocess(X_test)
-            
+
             # Then apply estimator-level preprocessing
             estimator = automl.model
             X_test_estimator = estimator.preprocess(X_test_task)
diff --git a/notebook/preprocess_api_example.py b/notebook/preprocess_api_example.py
index 3dcb232396..fdd4613788 100644
--- a/notebook/preprocess_api_example.py
+++ b/notebook/preprocess_api_example.py
@@ -5,10 +5,11 @@
 APIs exposed by FLAML AutoML.
 """
 
-from flaml import AutoML
+import numpy as np
 from sklearn.datasets import load_breast_cancer
 from sklearn.model_selection import train_test_split
-import numpy as np
+
+from flaml import AutoML
 
 # Load and split data
 print("Loading breast cancer dataset...")
diff --git a/test/automl/test_preprocess_api.py b/test/automl/test_preprocess_api.py
index 412b7e2f98..2a68c9e47c 100644
--- a/test/automl/test_preprocess_api.py
+++ b/test/automl/test_preprocess_api.py
@@ -15,7 +15,7 @@ def test_automl_preprocess_before_fit(self):
         """Test that calling preprocess before fit raises an error."""
         automl = AutoML()
         X_test = np.array([[1, 2, 3], [4, 5, 6]])
-        
+
         with self.assertRaises(AttributeError) as context:
             automl.preprocess(X_test)
         # Check that an error is raised about not being fitted
@@ -27,7 +27,7 @@ def test_automl_preprocess_classification(self):
         X, y = load_breast_cancer(return_X_y=True)
         X_train, y_train = X[:400], y[:400]
         X_test = X[400:450]
-        
+
         # Train AutoML
         automl = AutoML()
         automl_settings = {
@@ -38,10 +38,10 @@ def test_automl_preprocess_classification(self):
             "verbose": 0,
         }
         automl.fit(X_train, y_train, **automl_settings)
-        
+
         # Test task-level preprocessing
         X_preprocessed = automl.preprocess(X_test)
-        
+
         # Verify the output is not None and has the right shape
         self.assertIsNotNone(X_preprocessed)
         self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])
@@ -52,7 +52,7 @@ def test_automl_preprocess_regression(self):
         X, y = load_diabetes(return_X_y=True)
         X_train, y_train = X[:300], y[:300]
         X_test = X[300:350]
-        
+
         # Train AutoML
         automl = AutoML()
         automl_settings = {
@@ -63,10 +63,10 @@ def test_automl_preprocess_regression(self):
             "verbose": 0,
         }
         automl.fit(X_train, y_train, **automl_settings)
-        
+
         # Test task-level preprocessing
         X_preprocessed = automl.preprocess(X_test)
-        
+
         # Verify the output
         self.assertIsNotNone(X_preprocessed)
         self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])
@@ -74,19 +74,23 @@ def test_automl_preprocess_regression(self):
     def test_automl_preprocess_with_dataframe(self):
         """Test task-level preprocessing with pandas DataFrame."""
         # Create a simple dataset
-        X_train = pd.DataFrame({
-            "feature1": [1, 2, 3, 4, 5] * 20,
-            "feature2": [5, 4, 3, 2, 1] * 20,
-            "category": ["a", "b", "a", "b", "a"] * 20,
-        })
+        X_train = pd.DataFrame(
+            {
+                "feature1": [1, 2, 3, 4, 5] * 20,
+                "feature2": [5, 4, 3, 2, 1] * 20,
+                "category": ["a", "b", "a", "b", "a"] * 20,
+            }
+        )
         y_train = pd.Series([0, 1, 0, 1, 0] * 20)
-        
-        X_test = pd.DataFrame({
-            "feature1": [6, 7, 8],
-            "feature2": [1, 2, 3],
-            "category": ["a", "b", "a"],
-        })
-        
+
+        X_test = pd.DataFrame(
+            {
+                "feature1": [6, 7, 8],
+                "feature2": [1, 2, 3],
+                "category": ["a", "b", "a"],
+            }
+        )
+
         # Train AutoML
         automl = AutoML()
         automl_settings = {
@@ -97,13 +101,13 @@ def test_automl_preprocess_with_dataframe(self):
             "verbose": 0,
         }
         automl.fit(X_train, y_train, **automl_settings)
-        
+
         # Test preprocessing
         X_preprocessed = automl.preprocess(X_test)
-        
+
         # Verify the output - check the number of rows matches
         self.assertIsNotNone(X_preprocessed)
-        preprocessed_len = len(X_preprocessed) if hasattr(X_preprocessed, '__len__') else X_preprocessed.shape[0]
+        preprocessed_len = len(X_preprocessed) if hasattr(X_preprocessed, "__len__") else X_preprocessed.shape[0]
         self.assertEqual(preprocessed_len, len(X_test))
 
     def test_estimator_preprocess(self):
@@ -112,7 +116,7 @@ def test_estimator_preprocess(self):
         X, y = load_breast_cancer(return_X_y=True)
         X_train, y_train = X[:400], y[:400]
         X_test = X[400:450]
-        
+
         # Train AutoML
         automl = AutoML()
         automl_settings = {
@@ -123,17 +127,17 @@ def test_estimator_preprocess(self):
             "verbose": 0,
         }
         automl.fit(X_train, y_train, **automl_settings)
-        
+
         # Get the trained estimator
         estimator = automl.model
         self.assertIsNotNone(estimator)
-        
+
         # First apply task-level preprocessing
         X_task_preprocessed = automl.preprocess(X_test)
-        
+
         # Then apply estimator-level preprocessing
         X_estimator_preprocessed = estimator.preprocess(X_task_preprocessed)
-        
+
         # Verify the output
         self.assertIsNotNone(X_estimator_preprocessed)
         self.assertEqual(X_estimator_preprocessed.shape[0], X_test.shape[0])
@@ -144,7 +148,7 @@ def test_preprocess_pipeline(self):
         X, y = load_breast_cancer(return_X_y=True)
         X_train, y_train = X[:400], y[:400]
         X_test = X[400:450]
-        
+
         # Train AutoML
         automl = AutoML()
         automl_settings = {
@@ -155,38 +159,42 @@ def test_preprocess_pipeline(self):
             "verbose": 0,
         }
         automl.fit(X_train, y_train, **automl_settings)
-        
+
         # Apply the complete preprocessing pipeline
         X_task_preprocessed = automl.preprocess(X_test)
         X_final = automl.model.preprocess(X_task_preprocessed)
-        
+
         # Verify predictions work with preprocessed data
-        # The internal predict already does this preprocessing, 
+        # The internal predict already does this preprocessing,
         # but we verify our manual preprocessing gives consistent results
         y_pred_manual = automl.model._model.predict(X_final)
         y_pred_auto = automl.predict(X_test)
-        
+
         # Both should give the same predictions
         np.testing.assert_array_equal(y_pred_manual, y_pred_auto)
 
     def test_preprocess_with_mixed_types(self):
         """Test preprocessing with mixed data types."""
         # Create dataset with mixed types
-        X_train = pd.DataFrame({
-            "numeric1": np.random.rand(100),
-            "numeric2": np.random.randint(0, 100, 100),
-            "categorical": np.random.choice(["cat", "dog", "bird"], 100),
-            "boolean": np.random.choice([True, False], 100),
-        })
+        X_train = pd.DataFrame(
+            {
+                "numeric1": np.random.rand(100),
+                "numeric2": np.random.randint(0, 100, 100),
+                "categorical": np.random.choice(["cat", "dog", "bird"], 100),
+                "boolean": np.random.choice([True, False], 100),
+            }
+        )
         y_train = pd.Series(np.random.randint(0, 2, 100))
-        
-        X_test = pd.DataFrame({
-            "numeric1": np.random.rand(10),
-            "numeric2": np.random.randint(0, 100, 10),
-            "categorical": np.random.choice(["cat", "dog", "bird"], 10),
-            "boolean": np.random.choice([True, False], 10),
-        })
-        
+
+        X_test = pd.DataFrame(
+            {
+                "numeric1": np.random.rand(10),
+                "numeric2": np.random.randint(0, 100, 10),
+                "categorical": np.random.choice(["cat", "dog", "bird"], 10),
+                "boolean": np.random.choice([True, False], 10),
+            }
+        )
+
         # Train AutoML
         automl = AutoML()
         automl_settings = {
@@ -197,28 +205,28 @@ def test_preprocess_with_mixed_types(self):
             "verbose": 0,
         }
         automl.fit(X_train, y_train, **automl_settings)
-        
+
         # Test preprocessing
         X_preprocessed = automl.preprocess(X_test)
-        
+
         # Verify the output
         self.assertIsNotNone(X_preprocessed)
 
     def test_estimator_preprocess_without_automl(self):
         """Test that estimator.preprocess() can be used independently."""
         from flaml.automl.model import LGBMEstimator
-        
+
         # Create a simple estimator
         X_train = np.random.rand(100, 5)
         y_train = np.random.randint(0, 2, 100)
-        
+
         estimator = LGBMEstimator(task="classification")
         estimator.fit(X_train, y_train)
-        
+
         # Test preprocessing
         X_test = np.random.rand(10, 5)
         X_preprocessed = estimator.preprocess(X_test)
-        
+
         # Verify the output
         self.assertIsNotNone(X_preprocessed)
         self.assertEqual(X_preprocessed.shape, X_test.shape)
diff --git a/website/docs/Use-Cases/Task-Oriented-AutoML.md b/website/docs/Use-Cases/Task-Oriented-AutoML.md
index 30ef131121..602602c77c 100644
--- a/website/docs/Use-Cases/Task-Oriented-AutoML.md
+++ b/website/docs/Use-Cases/Task-Oriented-AutoML.md
@@ -732,7 +732,7 @@ FLAML provides two levels of preprocessing that can be accessed as public APIs:
 
 1. **Task-level preprocessing** (`automl.preprocess()`): This applies transformations that are specific to the task type, such as handling data types, sparse matrices, and feature transformations learned during training.
 
-2. **Estimator-level preprocessing** (`estimator.preprocess()`): This applies transformations specific to the estimator type (e.g., LightGBM, XGBoost).
+1. **Estimator-level preprocessing** (`estimator.preprocess()`): This applies transformations specific to the estimator type (e.g., LightGBM, XGBoost).
 
 The task-level preprocessing should be applied before the estimator-level preprocessing.
 

From a70bfbacd2589759d4b06aa822e6911e7f9d97ee Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Wed, 21 Jan 2026 14:33:42 +0800
Subject: [PATCH 7/7] Remove preprocess_api_example.py, make tests faster

---
 notebook/preprocess_api_example.py | 97 ------------------------------
 test/automl/test_preprocess_api.py | 12 ++--
 2 files changed, 6 insertions(+), 103 deletions(-)
 delete mode 100644 notebook/preprocess_api_example.py

diff --git a/notebook/preprocess_api_example.py b/notebook/preprocess_api_example.py
deleted file mode 100644
index fdd4613788..0000000000
--- a/notebook/preprocess_api_example.py
+++ /dev/null
@@ -1,97 +0,0 @@
-"""
-Example demonstrating the use of FLAML's preprocess() API.
-
-This script shows how to use both task-level and estimator-level preprocessing
-APIs exposed by FLAML AutoML.
-"""
-
-import numpy as np
-from sklearn.datasets import load_breast_cancer
-from sklearn.model_selection import train_test_split
-
-from flaml import AutoML
-
-# Load and split data
-print("Loading breast cancer dataset...")
-X, y = load_breast_cancer(return_X_y=True)
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-print(f"Training data shape: {X_train.shape}")
-print(f"Test data shape: {X_test.shape}")
-
-# Train AutoML model
-print("\nTraining AutoML model...")
-automl = AutoML()
-automl_settings = {
-    "time_budget": 10,  # 10 seconds
-    "task": "classification",
-    "metric": "accuracy",
-    "estimator_list": ["lgbm", "xgboost"],
-    "verbose": 0,
-}
-automl.fit(X_train, y_train, **automl_settings)
-
-print(f"Best estimator: {automl.best_estimator}")
-print(f"Best accuracy: {1 - automl.best_loss:.4f}")
-
-# Example 1: Using task-level preprocessing
-print("\n" + "=" * 60)
-print("Example 1: Task-level preprocessing")
-print("=" * 60)
-X_test_task = automl.preprocess(X_test)
-print(f"Original test data shape: {X_test.shape}")
-print(f"After task preprocessing: {X_test_task.shape}")
-
-# Example 2: Using estimator-level preprocessing
-print("\n" + "=" * 60)
-print("Example 2: Estimator-level preprocessing")
-print("=" * 60)
-estimator = automl.model
-X_test_estimator = estimator.preprocess(X_test_task)
-print(f"After estimator preprocessing: {X_test_estimator.shape}")
-
-# Example 3: Complete preprocessing pipeline
-print("\n" + "=" * 60)
-print("Example 3: Complete preprocessing pipeline")
-print("=" * 60)
-# Apply both levels of preprocessing
-X_preprocessed = automl.preprocess(X_test)
-X_final = automl.model.preprocess(X_preprocessed)
-
-# Manual prediction using fully preprocessed data
-y_pred_manual = automl.model._model.predict(X_final)
-
-# Compare with AutoML's predict method (which does preprocessing internally)
-y_pred_auto = automl.predict(X_test)
-
-print(f"Predictions match: {np.array_equal(y_pred_manual, y_pred_auto)}")
-print(f"Manual prediction sample: {y_pred_manual[:5]}")
-print(f"Auto prediction sample: {y_pred_auto[:5]}")
-
-# Example 4: Using preprocessing for custom inference
-print("\n" + "=" * 60)
-print("Example 4: Custom inference with preprocessing")
-print("=" * 60)
-# You might want to apply preprocessing separately for:
-# - Debugging
-# - Custom inference pipelines
-# - Integration with other tools
-
-# Get preprocessed features
-X_features = automl.preprocess(X_test)
-X_features = automl.model.preprocess(X_features)
-
-# Now you can use these features with the underlying model or for analysis
-print(f"Preprocessed features ready for custom use: {X_features.shape}")
-print(f"Feature statistics - Mean: {np.mean(X_features):.4f}, Std: {np.std(X_features):.4f}")
-
-print("\n" + "=" * 60)
-print("Summary")
-print("=" * 60)
-print("The preprocess() API allows you to:")
-print("1. Apply task-level preprocessing with automl.preprocess()")
-print("2. Apply estimator-level preprocessing with estimator.preprocess()")
-print("3. Chain both for complete preprocessing pipeline")
-print("4. Use preprocessed data for custom inference or analysis")
-print("\nNote: Task-level preprocessing should always be applied before")
-print("      estimator-level preprocessing.")
diff --git a/test/automl/test_preprocess_api.py b/test/automl/test_preprocess_api.py
index 2a68c9e47c..45b9c6143b 100644
--- a/test/automl/test_preprocess_api.py
+++ b/test/automl/test_preprocess_api.py
@@ -31,7 +31,7 @@ def test_automl_preprocess_classification(self):
         # Train AutoML
         automl = AutoML()
         automl_settings = {
-            "time_budget": 5,
+            "max_iter": 5,
             "task": "classification",
             "metric": "accuracy",
             "estimator_list": ["lgbm"],
@@ -56,7 +56,7 @@ def test_automl_preprocess_regression(self):
         # Train AutoML
         automl = AutoML()
         automl_settings = {
-            "time_budget": 5,
+            "max_iter": 5,
             "task": "regression",
             "metric": "r2",
             "estimator_list": ["lgbm"],
@@ -94,7 +94,7 @@ def test_automl_preprocess_with_dataframe(self):
         # Train AutoML
         automl = AutoML()
         automl_settings = {
-            "time_budget": 5,
+            "max_iter": 5,
             "task": "classification",
             "metric": "accuracy",
             "estimator_list": ["lgbm"],
@@ -120,7 +120,7 @@ def test_estimator_preprocess(self):
         # Train AutoML
         automl = AutoML()
         automl_settings = {
-            "time_budget": 5,
+            "max_iter": 5,
             "task": "classification",
             "metric": "accuracy",
             "estimator_list": ["lgbm"],
@@ -152,7 +152,7 @@ def test_preprocess_pipeline(self):
         # Train AutoML
         automl = AutoML()
         automl_settings = {
-            "time_budget": 5,
+            "max_iter": 5,
             "task": "classification",
             "metric": "accuracy",
             "estimator_list": ["lgbm"],
@@ -198,7 +198,7 @@ def test_preprocess_with_mixed_types(self):
         # Train AutoML
         automl = AutoML()
         automl_settings = {
-            "time_budget": 5,
+            "max_iter": 5,
             "task": "classification",
             "metric": "accuracy",
             "estimator_list": ["lgbm"],