Skip to content

Commit 00e796f

Browse files
Copilot and thinkall authored
Fix pandas 3.0 compatibility: StringDtype, datetime resolution, deprecated APIs, pyspark.pandas import (#1527)
* Initial plan * Fix pandas 3.0 compatibility: StringDtype, datetime resolution, deprecated APIs - Add "str" to string dtype name checks in data.py, ts_data.py, generic_task.py (pandas 3.0 uses StringDtype(na_value=nan) with dtype.name="str") - Wrap np.issubdtype(columns.dtype, np.integer) in try/except for StringDtype column indexes in pandas 3.0 - Use is_datetime64_any_dtype() instead of hardcoded "datetime64[ns]" check (pandas 3.0 uses varying datetime resolutions like datetime64[s], [ms], [us]) - Replace fillna(method="ffill") with ffill() in test files (removed in pandas 3.0) - Replace deprecated frequency aliases "T"->"min", "H"->"h" in test files Agent-Logs-Url: https://github.com/microsoft/FLAML/sessions/e2339d06-2236-4c1e-901b-b00a558f3796 Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * CI: pin pandas 2 on ubuntu+python3.12 for backward compat testing Agent-Logs-Url: https://github.com/microsoft/FLAML/sessions/7c3ce4f8-8623-4d3e-8986-2fa83e1daa5a Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Add is_datetime64_any_dtype fallback in ts_data.py except ImportError block Agent-Logs-Url: https://github.com/microsoft/FLAML/sessions/2b20aa86-81b1-4b3b-b6af-0cf7669c5575 Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Guard is_datetime64_any_dtype calls with None check in ts_data.py Agent-Logs-Url: https://github.com/microsoft/FLAML/sessions/395c2eef-887f-4278-a4e0-4d4d276afeee Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Fix Spark test failures: separate core pyspark imports from pyspark.pandas Agent-Logs-Url: https://github.com/microsoft/FLAML/sessions/28e597b7-ae5e-41ba-957d-04aaa78834bd Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Narrow pyspark.pandas fallback to ImportError instead of broad Exception Agent-Logs-Url: https://github.com/microsoft/FLAML/sessions/28e597b7-ae5e-41ba-957d-04aaa78834bd Co-authored-by: thinkall 
<3197038+thinkall@users.noreply.github.com> * Guard set_option and ps.from_pandas calls in to_pandas_on_spark when pyspark.pandas unavailable Agent-Logs-Url: https://github.com/microsoft/FLAML/sessions/60ca1746-e262-40e3-a970-2db40bda57dd Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Pin pandas<3 for all pyspark environments and add pandas<3 to spark extra in setup.py Agent-Logs-Url: https://github.com/microsoft/FLAML/sessions/0512ff2f-87ab-4394-b688-ea1416c26c6a Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> Co-authored-by: Li Jiang <bnujli@gmail.com>
1 parent 82ebcf5 commit 00e796f

10 files changed

Lines changed: 53 additions & 21 deletions

File tree

.github/workflows/python-package.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ jobs:
6464
pip install -e .
6565
python -c "import flaml"
6666
pip install -e .[test]
67+
- name: On Ubuntu with pyspark, pin pandas<3 (pyspark doesn't support pandas 3.0 yet)
68+
if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.10'
69+
run: |
70+
pip install "pandas>=2.0,<3"
6771
- name: On Ubuntu python 3.11, install pyspark 3.5.1
6872
if: matrix.python-version == '3.11' && matrix.os == 'ubuntu-latest'
6973
run: |

flaml/automl/data.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@
1616
from flaml.automl.spark import DataFrame, F, Series, T, pd, ps, psDataFrame, psSeries
1717
from flaml.automl.training_log import training_log_reader
1818

19+
try:
20+
from pandas.api.types import is_datetime64_any_dtype
21+
except ImportError:
22+
is_datetime64_any_dtype = None
23+
1924
try:
2025
from scipy.sparse import issparse, vstack
2126
except ImportError:
@@ -302,7 +307,7 @@ def fit_transform(self, X: Union[DataFrame, np.ndarray], y, task: Union[str, "Ta
302307
y = y.rename(TS_VALUE_COL)
303308
for column in X.columns:
304309
# sklearn\utils\validation.py needs int/float values
305-
if X[column].dtype.name in ("object", "category", "string"):
310+
if X[column].dtype.name in ("object", "category", "string", "str"):
306311
if X[column].nunique() == 1 or X[column].nunique(dropna=True) == n - X[column].isnull().sum():
307312
X.drop(columns=column, inplace=True)
308313
drop = True
@@ -318,7 +323,7 @@ def fit_transform(self, X: Union[DataFrame, np.ndarray], y, task: Union[str, "Ta
318323
X.drop(columns=column, inplace=True)
319324
drop = True
320325
else: # datetime or numeric
321-
if X[column].dtype.name == "datetime64[ns]":
326+
if is_datetime64_any_dtype is not None and is_datetime64_any_dtype(X[column]):
322327
tmp_dt = X[column].dt
323328
new_columns_dict = {
324329
f"year_{column}": tmp_dt.year,
@@ -347,9 +352,11 @@ def fit_transform(self, X: Union[DataFrame, np.ndarray], y, task: Union[str, "Ta
347352
X[cat_columns] = X[cat_columns].astype("category")
348353
if num_columns:
349354
X_num = X[num_columns]
350-
if np.issubdtype(X_num.columns.dtype, np.integer) and (
351-
drop or min(X_num.columns) != 0 or max(X_num.columns) != X_num.shape[1] - 1
352-
):
355+
try:
356+
is_int_cols = np.issubdtype(X_num.columns.dtype, np.integer)
357+
except TypeError:
358+
is_int_cols = False
359+
if is_int_cols and (drop or min(X_num.columns) != 0 or max(X_num.columns) != X_num.shape[1] - 1):
353360
X_num.columns = range(X_num.shape[1])
354361
drop = True
355362
else:
@@ -435,7 +442,7 @@ def transform(self, X: Union[DataFrame, np.array]):
435442
if self._task.is_ts_forecast():
436443
X.insert(0, TS_TIMESTAMP_COL, ds_col)
437444
for column in cat_columns:
438-
if X[column].dtype.name == "object":
445+
if X[column].dtype.name in ("object", "string", "str"):
439446
X[column] = X[column].fillna("__NAN__")
440447
elif X[column].dtype.name == "category":
441448
current_categories = X[column].cat.categories

flaml/automl/spark/__init__.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,8 @@
55
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
66
try:
77
import pyspark
8-
import pyspark.pandas as ps
98
import pyspark.sql.functions as F
109
import pyspark.sql.types as T
11-
from pyspark.pandas import DataFrame as psDataFrame
12-
from pyspark.pandas import Series as psSeries
13-
from pyspark.pandas import set_option
1410
from pyspark.sql import DataFrame as sparkDataFrame
1511
from pyspark.sql import SparkSession
1612
from pyspark.util import VersionUtils
@@ -29,6 +25,20 @@ class psDataFrame:
2925
else:
3026
ERROR = None
3127
_spark_major_minor_version = VersionUtils.majorMinorVersion(pyspark.__version__)
28+
# pyspark.pandas may fail with newer pandas versions (e.g., pandas 3.0)
29+
# but core pyspark functionality should still work
30+
try:
31+
import pyspark.pandas as ps
32+
from pyspark.pandas import DataFrame as psDataFrame
33+
from pyspark.pandas import Series as psSeries
34+
from pyspark.pandas import set_option
35+
except ImportError:
36+
37+
class psDataFrame:
38+
pass
39+
40+
ps = psSeries = psDataFrame
41+
set_option = None
3242

3343
try:
3444
import pandas as pd

flaml/automl/spark/utils.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@ def to_pandas_on_spark(
5858
print(pss)
5959
```
6060
"""
61-
set_option("compute.default_index_type", default_index_type)
61+
if set_option is not None:
62+
set_option("compute.default_index_type", default_index_type)
6263
try:
6364
orig_ps_conf = ps.get_option("compute.fail_on_ansi_mode")
6465
except Exception:
@@ -68,7 +69,14 @@ def to_pandas_on_spark(
6869

6970
try:
7071
if isinstance(df, (DataFrame, Series)):
71-
return ps.from_pandas(df)
72+
if set_option is not None:
73+
return ps.from_pandas(df)
74+
else:
75+
raise ImportError(
76+
"pyspark.pandas is not available (likely incompatible with installed pandas version). "
77+
"Cannot convert pandas DataFrame/Series to pandas-on-Spark. "
78+
"Consider downgrading pandas or upgrading pyspark."
79+
)
7280
elif isinstance(df, sparkDataFrame):
7381
if _spark_major_minor_version[0] == 3 and _spark_major_minor_version[1] < 3:
7482
return df.to_pandas_on_spark(index_col=index_col)

flaml/automl/task/generic_task.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ def validate_data(
167167
assert X[column].dtype.name in (
168168
"object",
169169
"string",
170+
"str",
170171
), "If the task is an NLP task, X can only contain text columns"
171172
for _, each_cell in X[column].items():
172173
if each_cell is not None:

flaml/automl/time_series/ts_data.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ class PD:
2525
pd.DataFrame = None
2626
pd.Series = None
2727
DataFrame = Series = None
28+
is_datetime64_any_dtype = None
2829

2930

3031
# dataclass will remove empty default value even with field(default_factory=lambda: [])
@@ -272,7 +273,7 @@ def enrich_dataframe(
272273

273274
new_cols = []
274275
for col in df.columns:
275-
if df[col].dtype.name == "datetime64[ns]":
276+
if is_datetime64_any_dtype is not None and is_datetime64_any_dtype(df[col]):
276277
extras = monthly_fourier_features(df[col], fourier_degree)
277278
extras.columns = [f"{col}_{c}" for c in extras.columns]
278279
extras.index = df.index
@@ -403,12 +404,12 @@ def fit(self, X: Union[DataFrame, np.array], y):
403404
continue
404405

405406
# Robust datetime detection (covers datetime64[ms/us/ns], tz-aware, etc.)
406-
if is_datetime64_any_dtype(X[column]):
407+
if is_datetime64_any_dtype is not None and is_datetime64_any_dtype(X[column]):
407408
self.datetime_columns.append(column)
408409
continue
409410

410411
# sklearn/utils/validation.py needs int/float values
411-
if X[column].dtype.name in ("object", "category", "string"):
412+
if X[column].dtype.name in ("object", "category", "string", "str"):
412413
if (
413414
# drop columns where all values are the same
414415
X[column].nunique() == 1

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
],
4848
"spark": [
4949
"pyspark>=3.2.0",
50+
"pandas<3",
5051
"joblibspark>=0.5.0",
5152
"joblib<=1.3.2",
5253
],

test/automl/test_extra_models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,8 +209,8 @@ def load_multi_dataset():
209209
df["timeStamp"] = pd.to_datetime(df["timeStamp"])
210210
df = df.set_index("timeStamp")
211211
df = df.resample("D").mean()
212-
df["temp"] = df["temp"].fillna(method="ffill")
213-
df["precip"] = df["precip"].fillna(method="ffill")
212+
df["temp"] = df["temp"].ffill()
213+
df["precip"] = df["precip"].ffill()
214214
df = df[:-2] # last two rows are NaN for 'demand' column so remove them
215215
df = df.reset_index()
216216

test/automl/test_forecast.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def test_numpy_large():
163163

164164
from flaml import AutoML
165165

166-
X_train = pd.date_range("2017-01-01", periods=70000, freq="T")
166+
X_train = pd.date_range("2017-01-01", periods=70000, freq="min")
167167
y_train = pd.DataFrame(np.random.randint(6500, 7500, 70000))
168168
automl = AutoML()
169169
automl.fit(
@@ -187,8 +187,8 @@ def load_multi_dataset():
187187
df["timeStamp"] = pd.to_datetime(df["timeStamp"])
188188
df = df.set_index("timeStamp")
189189
df = df.resample("D").mean()
190-
df["temp"] = df["temp"].fillna(method="ffill")
191-
df["precip"] = df["precip"].fillna(method="ffill")
190+
df["temp"] = df["temp"].ffill()
191+
df["precip"] = df["precip"].ffill()
192192
df = df[:-2] # last two rows are NaN for 'demand' column so remove them
193193
df = df.reset_index()
194194

test/automl/test_max_iter_1.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77

88
def test_max_iter_1():
9-
date_rng = pd.date_range(start="2024-01-01", periods=100, freq="H")
9+
date_rng = pd.date_range(start="2024-01-01", periods=100, freq="h")
1010
X = pd.DataFrame({"ds": date_rng})
1111
y_train_24h = np.random.rand(len(X)) * 100
1212

0 commit comments

Comments (0)