Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion flaml/automl/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,12 @@ def _eval_estimator(
logger.warning(f"ValueError {e} happened in `metric_loss_score`, set `val_loss` to `np.inf`")
metric_for_logging = {"pred_time": pred_time}
if log_training_metric:
train_pred_y = get_y_pred(estimator, X_train, eval_metric, task)
# For time series forecasting, X_train may be a sampled dataset whose
# test partition can be empty. Use the training partition from X_val
# (which is the dataset used to define y_train above) to keep shapes
# aligned and avoid empty prediction inputs.
X_train_for_metric = X_val.X_train if isinstance(X_val, TimeSeriesDataset) else X_train
train_pred_y = get_y_pred(estimator, X_train_for_metric, eval_metric, task)
metric_for_logging["train_loss"] = metric_loss_score(
eval_metric,
train_pred_y,
Expand Down
3 changes: 2 additions & 1 deletion flaml/automl/time_series/tcn.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,8 @@ def fit(self, X_train: TimeSeriesDataset, y_train=None, budget=None, **kwargs):
def predict(self, X):
X = self.enrich(X)
if isinstance(X, TimeSeriesDataset):
df = X.X_val
# Use X_train if X_val is empty (e.g., when computing training metrics)
df = X.X_val if len(X.test_data) > 0 else X.X_train
else:
df = X
dataset = DataframeDataset(
Expand Down
6 changes: 5 additions & 1 deletion flaml/automl/time_series/tft.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,11 @@ def predict(self, X):
last_data_cols = self.group_ids.copy()
last_data_cols.append(self.target_names[0])
last_data = self.data[lambda x: x.time_idx == x.time_idx.max()][last_data_cols]
decoder_data = X.X_val if isinstance(X, TimeSeriesDataset) else X
# Use X_train if test_data is empty (e.g., when computing training metrics)
if isinstance(X, TimeSeriesDataset):
decoder_data = X.X_val if len(X.test_data) > 0 else X.X_train
else:
decoder_data = X
if "time_idx" not in decoder_data:
decoder_data = add_time_idx_col(decoder_data)
decoder_data["time_idx"] += encoder_data["time_idx"].max() + 1 - decoder_data["time_idx"].min()
Expand Down
33 changes: 30 additions & 3 deletions flaml/automl/time_series/ts_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,13 @@ def predict(self, X: Union[TimeSeriesDataset, DataFrame], **kwargs):

elif isinstance(X, TimeSeriesDataset):
data = X
X = data.test_data[[self.time_col] + X.regressors]
# By default we predict on the dataset's test partition.
# Some internal call paths (e.g., training-metric logging) may pass a
# dataset whose test partition is empty; fall back to train partition.
if data.test_data is not None and len(data.test_data):
X = data.test_data[data.regressors + [data.time_col]]
else:
X = data.train_data[data.regressors + [data.time_col]]

if self._model is not None:
forecast = self._model.predict(X, **kwargs)
Expand Down Expand Up @@ -301,7 +307,13 @@ def predict(self, X, **kwargs):

if isinstance(X, TimeSeriesDataset):
data = X
X = data.test_data[data.regressors + [data.time_col]]
# By default we predict on the dataset's test partition.
# Some internal call paths (e.g., training-metric logging) may pass a
# dataset whose test partition is empty; fall back to train partition.
if data.test_data is not None and len(data.test_data):
X = data.test_data[data.regressors + [data.time_col]]
else:
X = data.train_data[data.regressors + [data.time_col]]

X = X.rename(columns={self.time_col: "ds"})
if self._model is not None:
Expand All @@ -327,11 +339,19 @@ def predict(self, X, **kwargs) -> pd.Series:

if isinstance(X, TimeSeriesDataset):
data = X
X = data.test_data[data.regressors + [data.time_col]]
# By default we predict on the dataset's test partition.
# Some internal call paths (e.g., training-metric logging) may pass a
# dataset whose test partition is empty; fall back to train partition.
if data.test_data is not None and len(data.test_data):
X = data.test_data[data.regressors + [data.time_col]]
else:
X = data.train_data[data.regressors + [data.time_col]]
else:
X = X[self.regressors + [self.time_col]]

if isinstance(X, DataFrame):
if X.shape[0] == 0:
return pd.Series([], name=self.target_names[0], dtype=float)
start = X[self.time_col].iloc[0]
end = X[self.time_col].iloc[-1]
if len(self.regressors):
Expand Down Expand Up @@ -829,6 +849,13 @@ def predict(self, X, **kwargs):
if isinstance(X, TimeSeriesDataset):
data = X
X = data.test_data
# By default we predict on the dataset's test partition.
# Some internal call paths (e.g., training-metric logging) may pass a
# dataset whose test partition is empty; fall back to train partition.
if data.test_data is not None and len(data.test_data):
X = data.test_data
else:
X = data.train_data

if self._model is not None:
X = X[self.regressors]
Expand Down
46 changes: 45 additions & 1 deletion test/automl/test_forecast.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,11 +681,55 @@ def split_by_date(df: pd.DataFrame, dt: datetime.date):
print("yahoo!")


def test_log_training_metric_ts_models():
    """Smoke-test that ``log_training_metric=True`` works with time-series estimators.

    Fits a short AutoML forecast run on the statsmodels CO2 dataset once per
    estimator in ``estimators_to_test`` (filtered to those registered in the
    current environment) and checks that fitting completes without error.
    """
    import statsmodels.api as sm

    from flaml.automl.task.time_series_task import TimeSeriesTask

    # Only exercise estimators actually registered for the "forecast" task in
    # this environment; optional dependencies (e.g. prophet, orbit) may be absent.
    estimators_all = TimeSeriesTask("forecast").estimators.keys()
    estimators_to_test = ["xgboost", "arima", "lassolars", "tcn", "snaive", "prophet", "orbit"]
    estimators = [
        est for est in estimators_to_test if est in estimators_all
    ]  # not all estimators available in current python env
    print(f"Testing estimators: {estimators}")

    # Prepare data: monthly-resampled CO2 series with gaps filled, renamed to
    # the (ds, y) column convention used by the forecast task.
    data = sm.datasets.co2.load_pandas().data["co2"]
    data = data.resample("MS").mean()
    data = data.bfill().ffill()
    data = data.to_frame().reset_index()
    data = data.rename(columns={"index": "ds", "co2": "y"})
    num_samples = data.shape[0]
    time_horizon = 12
    split_idx = num_samples - time_horizon
    df = data[:split_idx]  # train on all but the last `time_horizon` rows

    # Fit each estimator with log_training_metric=True; computing training
    # metrics must not raise for time series models.
    for estimator in estimators:
        print(f"\nTesting {estimator} with log_training_metric=True")
        automl = AutoML()
        settings = {
            "time_budget": 3,
            "metric": "mape",
            "task": "forecast",
            "eval_method": "holdout",
            "label": "y",
            "log_training_metric": True,  # the behavior under test: must not cause errors
            "estimator_list": [estimator],
        }
        automl.fit(dataframe=df, **settings, period=time_horizon, force_cancel=True)
        print(f"  ✅ {estimator} SUCCESS with log_training_metric=True")
        # best_estimator can be empty if the tiny time budget expired before
        # any trial finished; only assert when a model was actually selected.
        if automl.best_estimator:
            assert automl.best_estimator == estimator


if __name__ == "__main__":
    # Manual entry point: uncomment a line to run a single test directly.
    # (The diff residue showed both `test_numpy()` and its commented-out
    # replacement; the merged file keeps only the commented form.)
    # test_forecast_automl(60)
    # test_multivariate_forecast_num(5)
    # test_multivariate_forecast_cat(5)
    # test_numpy()
    # test_forecast_classification(5)
    # test_forecast_panel(5)
    # test_cv_step()
    test_log_training_metric_ts_models()
Loading