From db0c2c42746ab4a3f0b3854512e23ba91f304084 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 20 Jan 2026 15:13:24 -0700
Subject: [PATCH 1/4] feat: add shuffle benchmark variants with native write
 config support

- Add `get_spark_configs()` method to base Benchmark class for
  benchmark-specific Spark configurations
- Common Comet configs (enabled, logging) now defined in Python
  for jvm/native modes
- Add shuffle benchmark variants:
  - shuffle-hash-native-write: hash shuffle with Comet native parquet writes
  - shuffle-hash-spark-write: hash shuffle with Spark parquet writes
  - shuffle-roundrobin-native-write: round-robin with native writes
  - shuffle-roundrobin-spark-write: round-robin with Spark writes
- Add --print-configs CLI option to output benchmark configs
- Refactor run_all_benchmarks.sh to use helper function
- Exclude benchmarks/pyspark/** from CI test workflows

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 .github/workflows/iceberg_spark_test.yml  |  2 +
 .github/workflows/pr_build_linux.yml      |  2 +
 .github/workflows/pr_build_macos.yml      |  2 +
 .github/workflows/spark_sql_test.yml      |  2 +
 benchmarks/pyspark/benchmarks/__init__.py | 17 +++-
 benchmarks/pyspark/benchmarks/base.py     | 26 ++++++
 benchmarks/pyspark/benchmarks/shuffle.py  | 83 +++++++++++++++++++
 benchmarks/pyspark/run_all_benchmarks.sh  | 98 ++++++++++-------------
 benchmarks/pyspark/run_benchmark.py       | 31 ++++++-
 9 files changed, 203 insertions(+), 60 deletions(-)

diff --git a/.github/workflows/iceberg_spark_test.yml b/.github/workflows/iceberg_spark_test.yml
index 74badcda5f..90c7eb8de9 100644
--- a/.github/workflows/iceberg_spark_test.yml
+++ b/.github/workflows/iceberg_spark_test.yml
@@ -27,6 +27,7 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "benchmarks/pyspark/**"
       - "native/core/benches/**"
       - "native/spark-expr/benches/**"
       - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
@@ -35,6 +36,7 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "benchmarks/pyspark/**"
       - "native/core/benches/**"
       - "native/spark-expr/benches/**"
       - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
index 8e4dc5124b..544dd30f89 100644
--- a/.github/workflows/pr_build_linux.yml
+++ b/.github/workflows/pr_build_linux.yml
@@ -27,6 +27,7 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "benchmarks/pyspark/**"
       - "native/core/benches/**"
       - "native/spark-expr/benches/**"
       - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
@@ -35,6 +36,7 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "benchmarks/pyspark/**"
       - "native/core/benches/**"
       - "native/spark-expr/benches/**"
       - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
index f94071dbc7..c62e3c7a75 100644
--- a/.github/workflows/pr_build_macos.yml
+++ b/.github/workflows/pr_build_macos.yml
@@ -27,6 +27,7 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "benchmarks/pyspark/**"
       - "native/core/benches/**"
       - "native/spark-expr/benches/**"
       - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
@@ -35,6 +36,7 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "benchmarks/pyspark/**"
       - "native/core/benches/**"
       - "native/spark-expr/benches/**"
       - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml
index 2fe5fefe1a..b6ef22b4b9 100644
--- a/.github/workflows/spark_sql_test.yml
+++ b/.github/workflows/spark_sql_test.yml
@@ -27,6 +27,7 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "benchmarks/pyspark/**"
       - "native/core/benches/**"
       - "native/spark-expr/benches/**"
       - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
@@ -35,6 +36,7 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "benchmarks/pyspark/**"
       - "native/core/benches/**"
       - "native/spark-expr/benches/**"
       - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
diff --git a/benchmarks/pyspark/benchmarks/__init__.py b/benchmarks/pyspark/benchmarks/__init__.py
index 7d913a7d6d..e7a912c5a6 100644
--- a/benchmarks/pyspark/benchmarks/__init__.py
+++ b/benchmarks/pyspark/benchmarks/__init__.py
@@ -25,13 +25,24 @@
 from typing import Dict, Type, List
 
 from .base import Benchmark
-from .shuffle import ShuffleHashBenchmark, ShuffleRoundRobinBenchmark
+from .shuffle import (
+    ShuffleHashBenchmark,
+    ShuffleRoundRobinBenchmark,
+    ShuffleHashNativeWriteBenchmark,
+    ShuffleHashSparkWriteBenchmark,
+    ShuffleRoundRobinNativeWriteBenchmark,
+    ShuffleRoundRobinSparkWriteBenchmark,
+)
 
 
 # Registry of all available benchmarks
 _BENCHMARK_REGISTRY: Dict[str, Type[Benchmark]] = {
     ShuffleHashBenchmark.name(): ShuffleHashBenchmark,
     ShuffleRoundRobinBenchmark.name(): ShuffleRoundRobinBenchmark,
+    ShuffleHashNativeWriteBenchmark.name(): ShuffleHashNativeWriteBenchmark,
+    ShuffleHashSparkWriteBenchmark.name(): ShuffleHashSparkWriteBenchmark,
+    ShuffleRoundRobinNativeWriteBenchmark.name(): ShuffleRoundRobinNativeWriteBenchmark,
+    ShuffleRoundRobinSparkWriteBenchmark.name(): ShuffleRoundRobinSparkWriteBenchmark,
 }
 
 
@@ -76,4 +87,8 @@ def list_benchmarks() -> List[tuple[str, str]]:
     'list_benchmarks',
     'ShuffleHashBenchmark',
     'ShuffleRoundRobinBenchmark',
+    'ShuffleHashNativeWriteBenchmark',
+    'ShuffleHashSparkWriteBenchmark',
+    'ShuffleRoundRobinNativeWriteBenchmark',
+    'ShuffleRoundRobinSparkWriteBenchmark',
 ]
diff --git a/benchmarks/pyspark/benchmarks/base.py b/benchmarks/pyspark/benchmarks/base.py
index 7e8e8db5a9..57a494158a 100644
--- a/benchmarks/pyspark/benchmarks/base.py
+++ b/benchmarks/pyspark/benchmarks/base.py
@@ -55,6 +55,31 @@ def description(cls) -> str:
         """Return a short description of the benchmark."""
         pass
 
+    @classmethod
+    def get_spark_configs(cls, mode: str) -> Dict[str, str]:
+        """
+        Return Spark configurations required by this benchmark.
+
+        These configs are applied when creating the SparkSession.
+        Subclasses can override to specify benchmark-specific configs.
+        Subclasses should call super().get_spark_configs(mode) and update
+        the returned dict to preserve common configs.
+
+        Args:
+            mode: Execution mode (spark, jvm, native)
+
+        Returns:
+            Dictionary of config key -> value
+        """
+        if mode == "spark":
+            return {}
+        # Common configs for all Comet benchmarks (jvm and native modes)
+        return {
+            "spark.comet.enabled": "true",
+            "spark.comet.logFallbackReasons.enabled": "true",
+            "spark.comet.explainFallback.enabled": "true",
+        }
+
     @abstractmethod
     def run(self) -> Dict[str, Any]:
         """
@@ -109,6 +134,7 @@ def _print_spark_config(self):
         print(f"Comet enabled: {conf.get('spark.comet.enabled', 'false')}")
         print(f"Comet shuffle enabled: {conf.get('spark.comet.exec.shuffle.enabled', 'false')}")
         print(f"Comet shuffle mode: {conf.get('spark.comet.shuffle.mode', 'not set')}")
+        print(f"Comet native write: {conf.get('spark.comet.parquet.write.enabled', 'false')}")
         print(f"Spark UI: {self.spark.sparkContext.uiWebUrl}")
 
     def _time_operation(self, operation_fn):
diff --git a/benchmarks/pyspark/benchmarks/shuffle.py b/benchmarks/pyspark/benchmarks/shuffle.py
index 0facd2340d..f4853a9806 100644
--- a/benchmarks/pyspark/benchmarks/shuffle.py
+++ b/benchmarks/pyspark/benchmarks/shuffle.py
@@ -128,3 +128,86 @@ def description(cls) -> str:
     def _repartition(self, df: DataFrame) -> DataFrame:
         """Repartition using round-robin (no partition columns specified)."""
         return df.repartition(self.num_partitions)
+
+
+# Variants with explicit native write configuration
+
+
+class ShuffleHashNativeWriteBenchmark(ShuffleHashBenchmark):
+    """Shuffle benchmark using hash partitioning with Comet native parquet writes."""
+
+    @classmethod
+    def name(cls) -> str:
+        return "shuffle-hash-native-write"
+
+    @classmethod
+    def description(cls) -> str:
+        return "Hash shuffle with Comet native parquet writes enabled"
+
+    @classmethod
+    def get_spark_configs(cls, mode: str) -> dict:
+        configs = super().get_spark_configs(mode)
+        if mode != "spark":
+            configs.update({
+                "spark.comet.parquet.write.enabled": "true",
+                "spark.comet.operator.DataWritingCommandExec.allowIncompatible": "true",
+            })
+        return configs
+
+
+class ShuffleHashSparkWriteBenchmark(ShuffleHashBenchmark):
+    """Shuffle benchmark using hash partitioning with Spark parquet writes."""
+
+    @classmethod
+    def name(cls) -> str:
+        return "shuffle-hash-spark-write"
+
+    @classmethod
+    def description(cls) -> str:
+        return "Hash shuffle with Comet native parquet writes disabled"
+
+    @classmethod
+    def get_spark_configs(cls, mode: str) -> dict:
+        configs = super().get_spark_configs(mode)
+        configs["spark.comet.parquet.write.enabled"] = "false"
+        return configs
+
+
+class ShuffleRoundRobinNativeWriteBenchmark(ShuffleRoundRobinBenchmark):
+    """Shuffle benchmark using round-robin partitioning with Comet native parquet writes."""
+
+    @classmethod
+    def name(cls) -> str:
+        return "shuffle-roundrobin-native-write"
+
+    @classmethod
+    def description(cls) -> str:
+        return "Round-robin shuffle with Comet native parquet writes enabled"
+
+    @classmethod
+    def get_spark_configs(cls, mode: str) -> dict:
+        configs = super().get_spark_configs(mode)
+        if mode != "spark":
+            configs.update({
+                "spark.comet.parquet.write.enabled": "true",
+                "spark.comet.operator.DataWritingCommandExec.allowIncompatible": "true",
+            })
+        return configs
+
+
+class ShuffleRoundRobinSparkWriteBenchmark(ShuffleRoundRobinBenchmark):
+    """Shuffle benchmark using round-robin partitioning with Spark parquet writes."""
+
+    @classmethod
+    def name(cls) -> str:
+        return "shuffle-roundrobin-spark-write"
+
+    @classmethod
+    def description(cls) -> str:
+        return "Round-robin shuffle with Comet native parquet writes disabled"
+
+    @classmethod
+    def get_spark_configs(cls, mode: str) -> dict:
+        configs = super().get_spark_configs(mode)
+        configs["spark.comet.parquet.write.enabled"] = "false"
+        return configs
diff --git a/benchmarks/pyspark/run_all_benchmarks.sh b/benchmarks/pyspark/run_all_benchmarks.sh
index f2bc8f552f..1a8ef025a7 100755
--- a/benchmarks/pyspark/run_all_benchmarks.sh
+++ b/benchmarks/pyspark/run_all_benchmarks.sh
@@ -45,7 +45,7 @@ echo "========================================"
 
 # Run Spark baseline (no Comet)
 echo ""
-echo ">>> Running SPARK shuffle benchmark..."
+echo ">>> Running SPARK shuffle benchmark (baseline)..."
 $SPARK_HOME/bin/spark-submit \
   --master "$SPARK_MASTER" \
   --executor-memory "$EXECUTOR_MEMORY" \
@@ -55,62 +55,50 @@ $SPARK_HOME/bin/spark-submit \
   --conf spark.comet.exec.shuffle.enabled=false \
   "$SCRIPT_DIR/run_benchmark.py" \
   --data "$DATA_PATH" \
-  --mode spark
+  --mode spark \
+  --benchmark shuffle-hash
 
-# Run Comet JVM shuffle
-echo ""
-echo ">>> Running COMET JVM shuffle benchmark..."
-$SPARK_HOME/bin/spark-submit \
-  --master "$SPARK_MASTER" \
-  --executor-memory "$EXECUTOR_MEMORY" \
-  --jars "$COMET_JAR" \
-  --driver-class-path "$COMET_JAR" \
-  --conf spark.executor.extraClassPath="$COMET_JAR" \
-  --conf spark.eventLog.enabled=true \
-  --conf spark.eventLog.dir="$EVENT_LOG_DIR" \
-  --conf spark.memory.offHeap.enabled=true \
-  --conf spark.memory.offHeap.size=16g \
-  --conf spark.comet.enabled=true \
-  --conf spark.comet.operator.DataWritingCommandExec.allowIncompatible=true \
-  --conf spark.comet.parquet.write.enabled=true \
-  --conf spark.comet.logFallbackReasons.enabled=true \
-  --conf spark.comet.explainFallback.enabled=true \
-  --conf spark.comet.shuffle.mode=jvm \
-  --conf spark.comet.exec.shuffle.mode=jvm \
-  --conf spark.comet.exec.replaceSortMergeJoin=true \
-  --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
-  --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \
-  --conf spark.comet.cast.allowIncompatible=true \
-  "$SCRIPT_DIR/run_benchmark.py" \
-  --data "$DATA_PATH" \
-  --mode jvm
+# Helper function to run a Comet benchmark
+# Usage: run_comet_benchmark <mode> <benchmark_name>
+run_comet_benchmark() {
+  local shuffle_mode=$1
+  local benchmark=$2
 
-# Run Comet Native shuffle
-echo ""
-echo ">>> Running COMET NATIVE shuffle benchmark..."
-$SPARK_HOME/bin/spark-submit \
-  --master "$SPARK_MASTER" \
-  --executor-memory "$EXECUTOR_MEMORY" \
-  --jars "$COMET_JAR" \
-  --driver-class-path "$COMET_JAR" \
-  --conf spark.executor.extraClassPath="$COMET_JAR" \
-  --conf spark.eventLog.enabled=true \
-  --conf spark.eventLog.dir="$EVENT_LOG_DIR" \
-  --conf spark.memory.offHeap.enabled=true \
-  --conf spark.memory.offHeap.size=16g \
-  --conf spark.comet.enabled=true \
-  --conf spark.comet.operator.DataWritingCommandExec.allowIncompatible=true \
-  --conf spark.comet.parquet.write.enabled=true \
-  --conf spark.comet.logFallbackReasons.enabled=true \
-  --conf spark.comet.explainFallback.enabled=true \
-  --conf spark.comet.exec.shuffle.mode=native \
-  --conf spark.comet.exec.replaceSortMergeJoin=true \
-  --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
-  --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \
-  --conf spark.comet.cast.allowIncompatible=true \
-  "$SCRIPT_DIR/run_benchmark.py" \
-  --data "$DATA_PATH" \
-  --mode native
+  echo ""
+  echo ">>> Running COMET $shuffle_mode shuffle benchmark: $benchmark..."
+
+  $SPARK_HOME/bin/spark-submit \
+    --master "$SPARK_MASTER" \
+    --executor-memory "$EXECUTOR_MEMORY" \
+    --jars "$COMET_JAR" \
+    --driver-class-path "$COMET_JAR" \
+    --conf spark.executor.extraClassPath="$COMET_JAR" \
+    --conf spark.eventLog.enabled=true \
+    --conf spark.eventLog.dir="$EVENT_LOG_DIR" \
+    --conf spark.memory.offHeap.enabled=true \
+    --conf spark.memory.offHeap.size=16g \
+    --conf spark.comet.exec.shuffle.mode="$shuffle_mode" \
+    --conf spark.comet.exec.replaceSortMergeJoin=true \
+    --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
+    --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \
+    --conf spark.comet.cast.allowIncompatible=true \
+    "$SCRIPT_DIR/run_benchmark.py" \
+    --data "$DATA_PATH" \
+    --mode "$shuffle_mode" \
+    --benchmark "$benchmark"
+}
+
+# Run Comet JVM shuffle with native writes
+run_comet_benchmark jvm shuffle-hash-native-write
+
+# Run Comet JVM shuffle with Spark writes
+run_comet_benchmark jvm shuffle-hash-spark-write
+
+# Run Comet Native shuffle with native writes
+run_comet_benchmark native shuffle-hash-native-write
+
+# Run Comet Native shuffle with Spark writes
+run_comet_benchmark native shuffle-hash-spark-write
 
 echo ""
 echo "========================================"
diff --git a/benchmarks/pyspark/run_benchmark.py b/benchmarks/pyspark/run_benchmark.py
index 6713f0ff21..805e1f5290 100755
--- a/benchmarks/pyspark/run_benchmark.py
+++ b/benchmarks/pyspark/run_benchmark.py
@@ -66,6 +66,11 @@ def main():
         action="store_true",
         help="List all available benchmarks and exit"
     )
+    parser.add_argument(
+        "--print-configs",
+        action="store_true",
+        help="Print benchmark-specific Spark configs and exit (for shell scripts)"
+    )
 
     args = parser.parse_args()
 
@@ -76,6 +81,20 @@ def main():
             print(f"  {name:25s} - {description}")
         return 0
 
+    # Handle --print-configs (requires --benchmark and --mode)
+    if args.print_configs:
+        if not args.mode:
+            parser.error("--mode is required when using --print-configs")
+        try:
+            benchmark_cls = get_benchmark(args.benchmark)
+        except KeyError as e:
+            print(f"Error: {e}", file=sys.stderr)
+            return 1
+        configs = benchmark_cls.get_spark_configs(args.mode)
+        for key, value in configs.items():
+            print(f"--conf {key}={value}")
+        return 0
+
     # Validate required arguments
     if not args.data:
         parser.error("--data is required when running a benchmark")
@@ -90,10 +109,14 @@ def main():
         print("\nUse --list-benchmarks to see available benchmarks", file=sys.stderr)
         return 1
 
-    # Create Spark session
-    spark = SparkSession.builder \
-        .appName(f"{benchmark_cls.name()}-{args.mode.upper()}") \
-        .getOrCreate()
+    # Get benchmark-specific configs
+    benchmark_configs = benchmark_cls.get_spark_configs(args.mode)
+
+    # Create Spark session with benchmark-specific configs
+    builder = SparkSession.builder.appName(f"{benchmark_cls.name()}-{args.mode.upper()}")
+    for key, value in benchmark_configs.items():
+        builder = builder.config(key, value)
+    spark = builder.getOrCreate()
 
     try:
         # Create and run the benchmark

From 00a58efc252aca28d1d60281c91d6596497ec8d3 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 20 Jan 2026 15:18:54 -0700
Subject: [PATCH 2/4] refactor: rename --mode to --shuffle-mode for clarity

---
 benchmarks/pyspark/run_all_benchmarks.sh |  4 ++--
 benchmarks/pyspark/run_benchmark.py      | 24 ++++++++++++------------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/benchmarks/pyspark/run_all_benchmarks.sh b/benchmarks/pyspark/run_all_benchmarks.sh
index 1a8ef025a7..239f2abee0 100755
--- a/benchmarks/pyspark/run_all_benchmarks.sh
+++ b/benchmarks/pyspark/run_all_benchmarks.sh
@@ -55,7 +55,7 @@ $SPARK_HOME/bin/spark-submit \
   --conf spark.comet.exec.shuffle.enabled=false \
   "$SCRIPT_DIR/run_benchmark.py" \
   --data "$DATA_PATH" \
-  --mode spark \
+  --shuffle-mode spark \
   --benchmark shuffle-hash
 
 # Helper function to run a Comet benchmark
@@ -84,7 +84,7 @@ run_comet_benchmark() {
     --conf spark.comet.cast.allowIncompatible=true \
     "$SCRIPT_DIR/run_benchmark.py" \
     --data "$DATA_PATH" \
-    --mode "$shuffle_mode" \
+    --shuffle-mode "$shuffle_mode" \
     --benchmark "$benchmark"
 }
 
diff --git a/benchmarks/pyspark/run_benchmark.py b/benchmarks/pyspark/run_benchmark.py
index 805e1f5290..99b2c99b56 100755
--- a/benchmarks/pyspark/run_benchmark.py
+++ b/benchmarks/pyspark/run_benchmark.py
@@ -38,10 +38,10 @@ def main():
         epilog="""
 Examples:
   # Run hash partitioning shuffle benchmark in Spark mode
-  python run_benchmark.py --data /path/to/data --mode spark --benchmark shuffle-hash
+  python run_benchmark.py --data /path/to/data --shuffle-mode spark --benchmark shuffle-hash
 
   # Run round-robin shuffle benchmark in Comet native mode
-  python run_benchmark.py --data /path/to/data --mode native --benchmark shuffle-roundrobin
+  python run_benchmark.py --data /path/to/data --shuffle-mode native --benchmark shuffle-roundrobin
 
   # List all available benchmarks
   python run_benchmark.py --list-benchmarks
@@ -52,7 +52,7 @@ def main():
         help="Path to input parquet data"
     )
     parser.add_argument(
-        "--mode", "-m",
+        "--shuffle-mode", "-m",
         choices=["spark", "jvm", "native"],
         help="Shuffle mode being tested"
     )
@@ -81,16 +81,16 @@ def main():
             print(f"  {name:25s} - {description}")
         return 0
 
-    # Handle --print-configs (requires --benchmark and --mode)
+    # Handle --print-configs (requires --benchmark and --shuffle-mode)
     if args.print_configs:
-        if not args.mode:
-            parser.error("--mode is required when using --print-configs")
+        if not args.shuffle_mode:
+            parser.error("--shuffle-mode is required when using --print-configs")
         try:
             benchmark_cls = get_benchmark(args.benchmark)
         except KeyError as e:
             print(f"Error: {e}", file=sys.stderr)
             return 1
-        configs = benchmark_cls.get_spark_configs(args.mode)
+        configs = benchmark_cls.get_spark_configs(args.shuffle_mode)
         for key, value in configs.items():
             print(f"--conf {key}={value}")
         return 0
@@ -98,8 +98,8 @@ def main():
     # Validate required arguments
     if not args.data:
         parser.error("--data is required when running a benchmark")
-    if not args.mode:
-        parser.error("--mode is required when running a benchmark")
+    if not args.shuffle_mode:
+        parser.error("--shuffle-mode is required when running a benchmark")
 
     # Get the benchmark class
     try:
@@ -110,17 +110,17 @@ def main():
         return 1
 
     # Get benchmark-specific configs
-    benchmark_configs = benchmark_cls.get_spark_configs(args.mode)
+    benchmark_configs = benchmark_cls.get_spark_configs(args.shuffle_mode)
 
     # Create Spark session with benchmark-specific configs
-    builder = SparkSession.builder.appName(f"{benchmark_cls.name()}-{args.mode.upper()}")
+    builder = SparkSession.builder.appName(f"{benchmark_cls.name()}-{args.shuffle_mode.upper()}")
     for key, value in benchmark_configs.items():
         builder = builder.config(key, value)
     spark = builder.getOrCreate()
 
     try:
         # Create and run the benchmark
-        benchmark = benchmark_cls(spark, args.data, args.mode)
+        benchmark = benchmark_cls(spark, args.data, args.shuffle_mode)
         results = benchmark.execute_timed()
 
         print("\nCheck Spark UI for shuffle sizes and detailed metrics")

From c8f839e3db3b1481438dc67041629fb3327bbe7f Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 20 Jan 2026 15:36:39 -0700
Subject: [PATCH 3/4] chore: add round robin benchmarks to
 run_all_benchmarks.sh

---
 benchmarks/pyspark/run_all_benchmarks.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/benchmarks/pyspark/run_all_benchmarks.sh b/benchmarks/pyspark/run_all_benchmarks.sh
index 239f2abee0..b9016819bf 100755
--- a/benchmarks/pyspark/run_all_benchmarks.sh
+++ b/benchmarks/pyspark/run_all_benchmarks.sh
@@ -90,15 +90,19 @@ run_comet_benchmark() {
 
 # Run Comet JVM shuffle with native writes
 run_comet_benchmark jvm shuffle-hash-native-write
+run_comet_benchmark jvm shuffle-roundrobin-native-write
 
 # Run Comet JVM shuffle with Spark writes
 run_comet_benchmark jvm shuffle-hash-spark-write
+run_comet_benchmark jvm shuffle-roundrobin-spark-write
 
 # Run Comet Native shuffle with native writes
 run_comet_benchmark native shuffle-hash-native-write
+run_comet_benchmark native shuffle-roundrobin-native-write
 
 # Run Comet Native shuffle with Spark writes
 run_comet_benchmark native shuffle-hash-spark-write
+run_comet_benchmark native shuffle-roundrobin-spark-write
 
 echo ""
 echo "========================================"

From fec2c9cf38008cd2dbd69d1c1ba5d86369ab2755 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 20 Jan 2026 15:38:55 -0700
Subject: [PATCH 4/4] chore: use jvm-shuffle and native-shuffle in job names

---
 benchmarks/pyspark/run_all_benchmarks.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/pyspark/run_all_benchmarks.sh b/benchmarks/pyspark/run_all_benchmarks.sh
index b9016819bf..f8bab8e30d 100755
--- a/benchmarks/pyspark/run_all_benchmarks.sh
+++ b/benchmarks/pyspark/run_all_benchmarks.sh
@@ -65,7 +65,7 @@ run_comet_benchmark() {
   local benchmark=$2
 
   echo ""
-  echo ">>> Running COMET $shuffle_mode shuffle benchmark: $benchmark..."
+  echo ">>> Running COMET $shuffle_mode-shuffle benchmark: $benchmark..."
 
   $SPARK_HOME/bin/spark-submit \
     --master "$SPARK_MASTER" \