
Commit 2df302d

gaogaotiantian authored and HyukjinKwon committed
[SPARK-56607][PYTHON][FOLLOWUP] Use pyspark.sql.DataFrame to support connect-only
### What changes were proposed in this pull request?

Use `pyspark.sql.DataFrame`, not the classic one, in `mlutils.py`.

### Why are the changes needed?

We have a connect-only CI that does not even have the classic `DataFrame` class. This util should work with the Connect `DataFrame` too.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

`test_pipeline` and `test_parity_pipeline` passed locally.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #55630 from gaogaotiantian/fix-mlutils.

Authored-by: Tian Gao <gaogaotiantian@hotmail.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
1 parent 6bfe0ef commit 2df302d

1 file changed: `python/pyspark/testing/mlutils.py` (6 additions, 2 deletions)
```diff
@@ -25,8 +25,7 @@
 from pyspark.ml.classification import Classifier, ClassificationModel
 from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
 from pyspark.ml.wrapper import _java2py
-from pyspark.sql import SparkSession
-from pyspark.sql.classic.dataframe import DataFrame
+from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import DoubleType
 from pyspark.testing.utils import ReusedPySparkTestCase as PySparkTestCase

@@ -100,6 +99,11 @@ def tearDownClass(cls):


 class MockDataset(DataFrame):
+    def __new__(cls, *args, **kwargs):
+        # DataFrame by default creates classic DataFrame, we need this to
+        # overwrite the default behavior.
+        return object.__new__(cls)
+
     def __init__(self):
         self.index = 0
```
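The `__new__` override above is the interesting part: when a parent class dispatches instantiation to a concrete implementation inside `__new__` (as `pyspark.sql.DataFrame` hands back a classic `DataFrame` by default), a test-only subclass inherits that dispatch and never gets an instance of itself unless it overrides `__new__`. Below is a minimal, pyspark-free sketch of this pattern; the class names `Base`, `ClassicImpl`, `BrokenMock`, and `FixedMock` are illustrative stand-ins, not pyspark APIs.

```python
class ClassicImpl:
    """Stands in for the concrete 'classic' implementation class."""


class Base:
    """Parent whose __new__ always dispatches to the concrete
    implementation, regardless of which subclass is being constructed."""

    def __new__(cls, *args, **kwargs):
        # Hands back a ClassicImpl instance no matter what cls is.
        return object.__new__(ClassicImpl)


class BrokenMock(Base):
    # Inherits the dispatching __new__, so BrokenMock() is NOT a
    # BrokenMock instance -- the analogue of MockDataset before the fix.
    pass


class FixedMock(Base):
    def __new__(cls, *args, **kwargs):
        # Bypass the dispatch so we really get an instance of this class,
        # mirroring the __new__ added to MockDataset in this commit.
        return object.__new__(cls)
```

With this in place, `FixedMock()` yields a genuine `FixedMock` instance, while `BrokenMock()` silently yields a `ClassicImpl` instead.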
0 commit comments