diff --git a/flaml/automl/time_series/ts_data.py b/flaml/automl/time_series/ts_data.py index 239d02c3ba..4a650db71c 100644 --- a/flaml/automl/time_series/ts_data.py +++ b/flaml/automl/time_series/ts_data.py @@ -245,12 +245,13 @@ def prettify_prediction(self, y_pred: Union[pd.DataFrame, pd.Series, np.ndarray] else: if isinstance(y_pred, np.ndarray): - raise ValueError("Can't enrich np.ndarray as self.test_data is None") + y_pred = pd.DataFrame(data=y_pred, columns=self.target_names) elif isinstance(y_pred, pd.Series): assert len(self.target_names) == 1, "Not enough columns in y_pred" y_pred = pd.DataFrame({self.target_names[0]: y_pred}) - # TODO auto-create the timestamps for the time column instead of throwing - raise NotImplementedError("Need a non-None test_data for this to work, for now") + if self.time_col not in y_pred.columns: + forward_frame = create_forward_frame(self.frequency, len(y_pred), self.end_date, self.time_col) + y_pred[self.time_col] = forward_frame[self.time_col].values assert isinstance(y_pred, pd.DataFrame) assert self.time_col in y_pred.columns @@ -499,10 +500,18 @@ def fit_transform(self, X: Union[DataFrame, np.array], y): def create_forward_frame( frequency: str, steps: int, - test_end_date: datetime.datetime, + last_timestamp: datetime.datetime, time_col: str, ): - start_date = test_end_date + pd.Timedelta(1, frequency) + if frequency is None: + raise ValueError("frequency cannot be None") + if last_timestamp is None or pd.isna(last_timestamp): + raise ValueError(f"last_timestamp cannot be None or NaT, got {last_timestamp!r}") + try: + offset = pd.tseries.frequencies.to_offset(frequency) + except ValueError as e: + raise ValueError(f"Invalid frequency {frequency!r}; expected a pandas offset alias.") from e + start_date = last_timestamp + offset times = pd.date_range( start=start_date, periods=steps, diff --git a/test/automl/test_ts_data.py b/test/automl/test_ts_data.py new file mode 100644 index 0000000000..042eddedf3 --- /dev/null +++ 
b/test/automl/test_ts_data.py
@@ -0,0 +1,63 @@
+import numpy as np
+import pandas as pd
+
+from flaml.automl.time_series.ts_data import TimeSeriesDataset, create_forward_frame
+
+
+def test_prettify_prediction_generates_timestamps_without_test_data():
+    train_data = pd.DataFrame(
+        {
+            "ds": pd.date_range("2020-01-01", periods=4, freq="D"),
+            "y": [1.0, 2.0, 3.0, 4.0],
+        }
+    )
+    dataset = TimeSeriesDataset(train_data, time_col="ds", target_names="y")
+    expected_times = pd.date_range("2020-01-05", periods=2, freq="D")
+
+    for y_pred in (
+        pd.DataFrame({"y": [5.0, 6.0]}, index=[10, 11]),
+        pd.Series([5.0, 6.0]),
+        np.array([5.0, 6.0]),
+    ):
+        prediction = dataset.prettify_prediction(y_pred)
+        assert isinstance(prediction, pd.DataFrame)
+        pd.testing.assert_series_equal(prediction["ds"], pd.Series(expected_times, name="ds"), check_index=False)
+        assert prediction["y"].tolist() == [5.0, 6.0]
+
+
+def test_prettify_prediction_generates_monthly_timestamps_without_test_data():
+    train_data = pd.DataFrame(
+        {
+            "ds": pd.date_range("2020-01-01", periods=4, freq="MS"),
+            "y": [1.0, 2.0, 3.0, 4.0],
+        }
+    )
+    dataset = TimeSeriesDataset(train_data, time_col="ds", target_names="y")
+
+    prediction = dataset.prettify_prediction(pd.DataFrame({"y": [5.0, 6.0]}))
+
+    pd.testing.assert_series_equal(
+        prediction["ds"],
+        pd.Series(pd.date_range("2020-05-01", periods=2, freq="MS"), name="ds"),
+        check_index=False,
+    )
+    assert prediction["y"].tolist() == [5.0, 6.0]
+
+
+def test_create_forward_frame_uses_next_frequency_offset():
+    # Pandas 3 uses QE-DEC while older supported versions use Q-DEC.
+    quarter_end_freq = "QE-DEC"
+    try:
+        pd.tseries.frequencies.to_offset(quarter_end_freq)
+    except ValueError:
+        quarter_end_freq = "Q-DEC"
+
+    weekly_frame = create_forward_frame("W-SUN", 2, pd.Timestamp("2020-01-05"), "ds")
+    quarterly_frame = create_forward_frame(quarter_end_freq, 2, pd.Timestamp("2020-03-31"), "ds")
+
+    pd.testing.assert_series_equal(
+        weekly_frame["ds"], pd.Series(pd.date_range("2020-01-12", periods=2, freq="W-SUN"), name="ds")
+    )
+    pd.testing.assert_series_equal(
+        quarterly_frame["ds"], pd.Series(pd.date_range("2020-06-30", periods=2, freq=quarter_end_freq), name="ds")
+    )