Skip to content

Commit 9dbe381

Browse files
gaogaotiantian authored and zhengruifeng committed
[SPARK-56340][PYTHON] Move input_type schema to eval conf
### What changes were proposed in this pull request? Use eval conf to pass the schema json, instead of sending a random string before UDF. ### Why are the changes needed? Clean up JVM <-> python worker protocol. We should not randomly pass data. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? `test_udf` passed locally, the rest is on CI. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #55170 from gaogaotiantian/move-input-type. Authored-by: Tian Gao <gaogaotiantian@hotmail.com> Signed-off-by: Ruifeng Zheng <ruifengz@foxmail.com>
1 parent 491add8 commit 9dbe381

File tree

2 files changed

+19
-7
lines changed

2 files changed

+19
-7
lines changed

python/pyspark/worker.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,13 @@ def state_server_socket_port(self) -> Optional[int | str]:
207207
except ValueError:
208208
return port
209209

210+
@property
211+
def input_type(self) -> Optional[DataType]:
212+
input_type = self.get("input_type", None, lower_str=False)
213+
if input_type is None:
214+
return None
215+
return _parse_datatype_json_string(input_type)
216+
210217

211218
def report_times(outfile, boot, init, finish, processing_time_ms):
212219
write_int(SpecialLengths.TIMING_DATA, outfile)
@@ -2532,11 +2539,9 @@ def read_udfs(pickleSer, infile, eval_type, runner_conf, eval_conf):
25322539
PythonEvalType.SQL_MAP_ARROW_ITER_UDF,
25332540
PythonEvalType.SQL_SCALAR_ARROW_UDF,
25342541
PythonEvalType.SQL_SCALAR_ARROW_ITER_UDF,
2542+
PythonEvalType.SQL_ARROW_BATCHED_UDF,
25352543
):
25362544
ser = ArrowStreamSerializer(write_start_stream=True)
2537-
elif eval_type == PythonEvalType.SQL_ARROW_BATCHED_UDF:
2538-
input_type = _parse_datatype_json_string(utf8_deserializer.loads(infile))
2539-
ser = ArrowStreamSerializer(write_start_stream=True)
25402545
else:
25412546
# Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
25422547
# pandas Series. See SPARK-27240.
@@ -2865,7 +2870,7 @@ def grouped_func(
28652870
ArrowTableToRowsConversion._create_converter(
28662871
f.dataType, none_on_identity=True, binary_as_bytes=runner_conf.binary_as_bytes
28672872
)
2868-
for f in input_type
2873+
for f in eval_conf.input_type
28692874
]
28702875

28712876
@fail_on_stopiteration
@@ -2968,7 +2973,7 @@ def func(split_index: int, data: Iterator[pa.RecordBatch]) -> Iterator[pa.Record
29682973
pandas_columns = ArrowBatchTransformer.to_pandas(
29692974
input_batch,
29702975
timezone=runner_conf.timezone,
2971-
schema=input_type,
2976+
schema=eval_conf.input_type,
29722977
struct_in_pandas="row",
29732978
ndarray_as_list=True,
29742979
prefer_int_ext_dtype=runner_conf.prefer_int_ext_dtype,

sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,10 +136,17 @@ class ArrowPythonWithNamedArgumentRunner(
136136

137137
override protected def runnerConf: Map[String, String] = super.runnerConf ++ pythonRunnerConf
138138

139-
override protected def writeUDF(dataOut: DataOutputStream): Unit = {
139+
override protected def evalConf: Map[String, String] = {
140140
if (evalType == PythonEvalType.SQL_ARROW_BATCHED_UDF) {
141-
PythonWorkerUtils.writeUTF(schema.json, dataOut)
141+
super.evalConf ++ Map(
142+
"input_type" -> schema.json
143+
)
144+
} else {
145+
super.evalConf
142146
}
147+
}
148+
149+
override protected def writeUDF(dataOut: DataOutputStream): Unit = {
143150
PythonUDFRunner.writeUDFs(dataOut, funcs, argMetas)
144151
}
145152
}

0 commit comments

Comments
 (0)