Skip to content

Commit 31cd9c7

Browse files
committed
[SPARK-56373][PYSPARK] Add docstring annotations to classify PySpark APIs for Spark Connect compatibility
Adds three RST directives to PySpark modules, classes, and methods to indicate Spark Connect compatibility status: - `.. classic:: true` -- API is only available in Classic Spark (not Spark Connect) - `.. connect:: true` -- API is available in Spark Connect - `.. connect_migration:: <message>` -- migration guidance for users transitioning to Spark Connect Annotations are resolved by inheriting from the nearest annotated ancestor; a child annotation overrides the parent's. No functional code changes -- docstrings only. The annotation spec is documented in `python/pyspark/__init__.py`.
1 parent 98cdaee commit 31cd9c7

File tree

27 files changed

+176
-3
lines changed

27 files changed

+176
-3
lines changed

python/pyspark/__init__.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,23 @@
4444
Information about a barrier task.
4545
- :class:`InheritableThread`:
4646
An inheritable thread to use in Spark when the pinned thread mode is on.
47+
48+
Spark Connect compatibility annotations
49+
=======================================
50+
51+
The following RST directives annotate PySpark modules, classes, and methods with
52+
their Spark Connect compatibility status:
53+
54+
- ``.. classic:: true`` -- the API is only available in Classic Spark (not Spark Connect).
55+
- ``.. connect:: true`` -- the API is available in Spark Connect.
56+
- ``.. connect_migration:: <message>`` -- migration guidance for users transitioning
57+
from Classic Spark to Spark Connect.
58+
59+
Annotations are resolved by inheriting from the nearest annotated ancestor. A child
60+
annotation overrides the parent's.
61+
62+
.. classic:: true
63+
.. connect:: true
4764
"""
4865

4966
import sys

python/pyspark/accumulators.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@
1515
# limitations under the License.
1616
#
1717

18+
"""
19+
.. classic:: true
20+
21+
.. connect_migration:: Use `df.observe(name, *exprs)` to collect named metrics during
22+
query execution.
23+
"""
24+
1825
import os
1926
import sys
2027
import select

python/pyspark/conf.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@
1515
# limitations under the License.
1616
#
1717

18+
"""
19+
.. classic:: true
20+
21+
.. connect_migration:: Read Spark SQL configuration values using `spark.conf.get(key)`
22+
and write them using `spark.conf.set(key, value)`.
23+
"""
24+
1825
__all__ = ["SparkConf"]
1926

2027
import sys

python/pyspark/core/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,7 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616
#
17+
18+
"""
19+
.. classic:: true
20+
"""

python/pyspark/core/context.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,8 @@ def setLogLevel(self, logLevel: str) -> None:
558558
Examples
559559
--------
560560
>>> sc.setLogLevel("WARN") # doctest: +SKIP
561+
562+
.. connect_migration:: Replace sc.setLogLevel(level) with spark.log.level(level)
561563
"""
562564
self._jsc.setLogLevel(logLevel)
563565

@@ -630,6 +632,8 @@ def applicationId(self) -> str:
630632
--------
631633
>>> sc.applicationId # doctest: +ELLIPSIS
632634
'local-...'
635+
636+
.. connect_migration:: Replace spark.sparkContext.applicationId with spark.conf.get("spark.app.id")
633637
"""
634638
return self._jsc.sc().applicationId()
635639

@@ -675,6 +679,9 @@ def defaultParallelism(self) -> int:
675679
--------
676680
>>> sc.defaultParallelism > 0
677681
True
682+
683+
.. connect_migration:: Replace spark.sparkContext.defaultParallelism with
684+
int(spark.conf.get("spark.default.parallelism", "200"))
678685
"""
679686
return self._jsc.sc().defaultParallelism()
680687

@@ -734,6 +741,9 @@ def emptyRDD(self) -> RDD[Any]:
734741
EmptyRDD...
735742
>>> sc.emptyRDD().count()
736743
0
744+
745+
.. connect_migration:: Replace sc.emptyRDD with an empty list. When used with
746+
createDataFrame: spark.createDataFrame([], schema)
737747
"""
738748
return RDD(self._jsc.emptyRDD(), self, NoOpSerializer())
739749

@@ -828,6 +838,9 @@ def parallelize(self, c: Iterable[T], numSlices: Optional[int] = None) -> RDD[T]
828838
>>> strings = ["a", "b", "c"]
829839
>>> sc.parallelize(strings, 2).glom().collect()
830840
[['a'], ['b', 'c']]
841+
842+
.. connect_migration:: Replace sc.parallelize(data) with the Python collection directly.
843+
When used with createDataFrame: spark.createDataFrame(data, schema)
831844
"""
832845
numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism
833846
if isinstance(c, range):
@@ -2212,6 +2225,10 @@ def setJobGroup(self, groupId: str, description: str, interruptOnCancel: bool =
22122225
>>> suppress = lock.acquire()
22132226
>>> print(result)
22142227
Cancelled
2228+
2229+
.. connect_migration:: Replace sc.setJobGroup(groupId, desc) with
2230+
spark.conf.set("spark.job.group.id", groupId) and
2231+
spark.conf.set("spark.job.description", desc)
22152232
"""
22162233
self._jsc.setJobGroup(groupId, description, interruptOnCancel)
22172234

@@ -2410,6 +2427,9 @@ def setLocalProperty(self, key: str, value: str) -> None:
24102427
-----
24112428
If you run jobs in parallel, use :class:`pyspark.InheritableThread` for thread
24122429
local inheritance.
2430+
2431+
.. connect_migration:: Replace spark.sparkContext.setLocalProperty(key, value) with
2432+
spark.conf.set(key, value)
24132433
"""
24142434
self._jsc.setLocalProperty(key, value)
24152435

@@ -2441,6 +2461,9 @@ def setJobDescription(self, value: str) -> None:
24412461
-----
24422462
If you run jobs in parallel, use :class:`pyspark.InheritableThread` for thread
24432463
local inheritance.
2464+
2465+
.. connect_migration:: Replace sc.setJobDescription(desc) with
2466+
spark.conf.set("spark.job.description", desc)
24442467
"""
24452468
self._jsc.setJobDescription(value)
24462469

@@ -2610,6 +2633,9 @@ def getConf(self) -> SparkConf:
26102633
"""Return a copy of this SparkContext's configuration :class:`SparkConf`.
26112634
26122635
.. versionadded:: 2.1.0
2636+
2637+
.. connect_migration:: Replace sc.getConf() with spark.conf. For a specific key use
2638+
spark.conf.get(key)
26132639
"""
26142640
conf = SparkConf()
26152641
conf.setAll(self._conf.getAll())

python/pyspark/core/rdd.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,9 @@ def map(self: "RDD[T]", f: Callable[[T], U], preservesPartitioning: bool = False
609609
>>> rdd = sc.parallelize(["b", "a", "c"])
610610
>>> sorted(rdd.map(lambda x: (x, 1)).collect())
611611
[('a', 1), ('b', 1), ('c', 1)]
612+
613+
.. connect_migration:: Replace rdd.map() with DataFrame operations. Use
614+
df.withColumn(), df.select() with a UDF, or a pandas UDF instead.
612615
"""
613616

614617
def func(_: int, iterator: Iterable[T]) -> Iterable[U]:
@@ -697,6 +700,9 @@ def mapPartitions(
697700
...
698701
>>> rdd.mapPartitions(f).collect()
699702
[3, 7]
703+
704+
.. connect_migration:: Replace rdd.mapPartitions() with a pandas UDF using
705+
applyInPandas.
700706
"""
701707

702708
def func(_: int, iterator: Iterable[T]) -> Iterable[U]:

python/pyspark/errors/exceptions/connect.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616
#
17+
18+
"""
19+
.. connect:: true
20+
"""
21+
1722
import grpc
1823
import json
1924
from grpc import StatusCode

python/pyspark/java_gateway.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
# limitations under the License.
1616
#
1717

18+
"""
19+
.. classic:: true
20+
"""
21+
1822
import atexit
1923
import os
2024
import signal

python/pyspark/ml/clustering.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1429,6 +1429,10 @@ class LDAModel(JavaModel, _LDAParams):
14291429
including local and distributed data structures.
14301430
14311431
.. versionadded:: 2.0.0
1432+
1433+
.. classic:: true
1434+
1435+
.. connect_migration:: LDA model family is not supported in Spark Connect.
14321436
"""
14331437

14341438
@since("3.0.0")
@@ -1530,6 +1534,10 @@ class DistributedLDAModel(LDAModel, JavaMLReadable["DistributedLDAModel"], JavaM
15301534
for each training document.
15311535
15321536
.. versionadded:: 2.0.0
1537+
1538+
.. classic:: true
1539+
1540+
.. connect_migration:: LDA model family is not supported in Spark Connect.
15331541
"""
15341542

15351543
@functools.cache
@@ -1608,6 +1616,10 @@ class LocalLDAModel(LDAModel, JavaMLReadable["LocalLDAModel"], JavaMLWritable):
16081616
This model stores the inferred topics only; it does not store info about the training dataset.
16091617
16101618
.. versionadded:: 2.0.0
1619+
1620+
.. classic:: true
1621+
1622+
.. connect_migration:: LDA model family is not supported in Spark Connect.
16111623
"""
16121624

16131625
pass
@@ -1682,6 +1694,10 @@ class LDA(JavaEstimator[LDAModel], _LDAParams, JavaMLReadable["LDA"], JavaMLWrit
16821694
>>> sameLocalModel = LocalLDAModel.load(local_model_path)
16831695
>>> model.transform(df).take(1) == sameLocalModel.transform(df).take(1)
16841696
True
1697+
1698+
.. classic:: true
1699+
1700+
.. connect_migration:: LDA model family is not supported in Spark Connect.
16851701
"""
16861702

16871703
_input_kwargs: Dict[str, Any]

python/pyspark/ml/connect/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,11 @@
1515
# limitations under the License.
1616
#
1717

18-
"""Spark Connect Python Client - ML module"""
18+
"""
19+
Spark Connect Python Client - ML module
20+
21+
.. connect:: true
22+
"""
1923

2024
from pyspark.sql.connect.utils import check_dependencies
2125

0 commit comments

Comments
 (0)