Skip to content

Commit 6976e02

Browse files
karuppayya and authored
Spark: Backport #16088 to Spark 3.4, 3.5, 4.0 (#16344)
Co-authored-by: karuppayya <karuppayya1990@gmaiul.com>
1 parent 62fe817 commit 6976e02

12 files changed

Lines changed: 384 additions & 6 deletions

File tree

spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.apache.iceberg.TableProperties;
2828
import org.apache.iceberg.exceptions.ValidationException;
2929
import org.apache.iceberg.hadoop.Util;
30+
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
3031
import org.apache.iceberg.util.PropertyUtil;
3132
import org.apache.spark.SparkConf;
3233
import org.apache.spark.sql.SparkSession;
@@ -318,6 +319,7 @@ public boolean aggregatePushDownEnabled() {
318319
public boolean adaptiveSplitSizeEnabled() {
319320
return confParser
320321
.booleanConf()
322+
.sessionConf(SparkSQLProperties.READ_ADAPTIVE_SPLIT_SIZE_ENABLED)
321323
.tableProperty(TableProperties.ADAPTIVE_SPLIT_SIZE_ENABLED)
322324
.defaultValue(TableProperties.ADAPTIVE_SPLIT_SIZE_ENABLED_DEFAULT)
323325
.parse();
@@ -329,6 +331,17 @@ public int parallelism() {
329331
return Math.max(defaultParallelism, numShufflePartitions);
330332
}
331333

334+
public int splitParallelism() {
335+
int parallelism =
336+
confParser
337+
.intConf()
338+
.sessionConf(SparkSQLProperties.READ_ADAPTIVE_SPLIT_SIZE_PARALLELISM)
339+
.defaultValue(parallelism())
340+
.parse();
341+
Preconditions.checkArgument(parallelism > 0, "Split parallelism must be > 0: %s", parallelism);
342+
return parallelism;
343+
}
344+
332345
public boolean distributedPlanningEnabled() {
333346
return table instanceof SupportsDistributedScanPlanning distributed
334347
&& distributed.allowDistributedPlanning()

spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,4 +111,13 @@ private SparkSQLProperties() {}
111111
public static final String ASYNC_MICRO_BATCH_PLANNING_ENABLED =
112112
"spark.sql.iceberg.async-micro-batch-planning-enabled";
113113
public static final boolean ASYNC_MICRO_BATCH_PLANNING_ENABLED_DEFAULT = false;
114+
115+
// Controls whether adaptive split sizing is enabled
116+
public static final String READ_ADAPTIVE_SPLIT_SIZE_ENABLED =
117+
"spark.sql.iceberg.read.adaptive-split-size.enabled";
118+
119+
// Overrides the parallelism used for adaptive split sizing. When unset, the parallelism
120+
// defaults to max(spark.default.parallelism, spark.sql.shuffle.partitions).
121+
public static final String READ_ADAPTIVE_SPLIT_SIZE_PARALLELISM =
122+
"spark.sql.iceberg.read.adaptive-split-size.parallelism";
114123
}

spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -356,8 +356,17 @@ public CustomMetric[] supportedCustomMetrics() {
356356
protected long adjustSplitSize(List<? extends ScanTask> tasks, long splitSize) {
357357
if (readConf.splitSizeOption() == null && readConf.adaptiveSplitSizeEnabled()) {
358358
long scanSize = tasks.stream().mapToLong(ScanTask::sizeBytes).sum();
359-
int parallelism = readConf.parallelism();
360-
return TableScanUtil.adjustSplitSize(scanSize, parallelism, splitSize);
359+
int parallelism = readConf.splitParallelism();
360+
long adjustedSplitSize = TableScanUtil.adjustSplitSize(scanSize, parallelism, splitSize);
361+
if (adjustedSplitSize != splitSize) {
362+
LOG.debug(
363+
"Adjusted split size from {} to {} for table {} with parallelism {}",
364+
splitSize,
365+
adjustedSplitSize,
366+
table().name(),
367+
parallelism);
368+
}
369+
return adjustedSplitSize;
361370
} else {
362371
return splitSize;
363372
}
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.iceberg.spark;
20+
21+
import static org.assertj.core.api.Assertions.assertThat;
22+
import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException;
23+
24+
import org.apache.iceberg.ParameterizedTestExtension;
25+
import org.apache.iceberg.Table;
26+
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
27+
import org.apache.spark.sql.internal.SQLConf;
28+
import org.apache.spark.sql.util.CaseInsensitiveStringMap;
29+
import org.junit.jupiter.api.AfterEach;
30+
import org.junit.jupiter.api.BeforeEach;
31+
import org.junit.jupiter.api.TestTemplate;
32+
import org.junit.jupiter.api.extension.ExtendWith;
33+
34+
@ExtendWith(ParameterizedTestExtension.class)
35+
public class TestSparkReadConf extends TestBaseWithCatalog {
36+
37+
@BeforeEach
38+
public void before() {
39+
super.before();
40+
sql("CREATE TABLE %s (id BIGINT, data STRING) USING iceberg", tableName);
41+
}
42+
43+
@AfterEach
44+
public void after() {
45+
sql("DROP TABLE IF EXISTS %s", tableName);
46+
}
47+
48+
@TestTemplate
49+
public void testSplitParallelismDefault() {
50+
Table table = validationCatalog.loadTable(tableIdent);
51+
SparkReadConf conf = new SparkReadConf(spark, table, CaseInsensitiveStringMap.empty());
52+
assertThat(conf.splitParallelism()).isEqualTo(conf.parallelism());
53+
}
54+
55+
@TestTemplate
56+
public void testSplitParallelismSessionConf() {
57+
Table table = validationCatalog.loadTable(tableIdent);
58+
withSQLConf(
59+
ImmutableMap.of(
60+
SQLConf.SHUFFLE_PARTITIONS().key(),
61+
"999",
62+
SparkSQLProperties.READ_ADAPTIVE_SPLIT_SIZE_PARALLELISM,
63+
"42"),
64+
() -> {
65+
SparkReadConf conf = new SparkReadConf(spark, table, CaseInsensitiveStringMap.empty());
66+
assertThat(conf.splitParallelism()).isEqualTo(42);
67+
});
68+
}
69+
70+
@TestTemplate
71+
public void testSplitParallelismRejectsZero() {
72+
Table table = validationCatalog.loadTable(tableIdent);
73+
withSQLConf(
74+
ImmutableMap.of(SparkSQLProperties.READ_ADAPTIVE_SPLIT_SIZE_PARALLELISM, "0"),
75+
() -> {
76+
SparkReadConf conf = new SparkReadConf(spark, table, CaseInsensitiveStringMap.empty());
77+
assertThatIllegalArgumentException()
78+
.isThrownBy(conf::splitParallelism)
79+
.withMessageContaining("Split parallelism must be > 0");
80+
});
81+
}
82+
83+
@TestTemplate
84+
public void testSplitParallelismRejectsNegative() {
85+
Table table = validationCatalog.loadTable(tableIdent);
86+
withSQLConf(
87+
ImmutableMap.of(SparkSQLProperties.READ_ADAPTIVE_SPLIT_SIZE_PARALLELISM, "-5"),
88+
() -> {
89+
SparkReadConf conf = new SparkReadConf(spark, table, CaseInsensitiveStringMap.empty());
90+
assertThatIllegalArgumentException()
91+
.isThrownBy(conf::splitParallelism)
92+
.withMessageContaining("Split parallelism must be > 0");
93+
});
94+
}
95+
}

spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.apache.iceberg.TableProperties;
2828
import org.apache.iceberg.exceptions.ValidationException;
2929
import org.apache.iceberg.hadoop.Util;
30+
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
3031
import org.apache.iceberg.util.PropertyUtil;
3132
import org.apache.spark.SparkConf;
3233
import org.apache.spark.sql.SparkSession;
@@ -314,6 +315,7 @@ public boolean aggregatePushDownEnabled() {
314315
public boolean adaptiveSplitSizeEnabled() {
315316
return confParser
316317
.booleanConf()
318+
.sessionConf(SparkSQLProperties.READ_ADAPTIVE_SPLIT_SIZE_ENABLED)
317319
.tableProperty(TableProperties.ADAPTIVE_SPLIT_SIZE_ENABLED)
318320
.defaultValue(TableProperties.ADAPTIVE_SPLIT_SIZE_ENABLED_DEFAULT)
319321
.parse();
@@ -325,6 +327,17 @@ public int parallelism() {
325327
return Math.max(defaultParallelism, numShufflePartitions);
326328
}
327329

330+
public int splitParallelism() {
331+
int parallelism =
332+
confParser
333+
.intConf()
334+
.sessionConf(SparkSQLProperties.READ_ADAPTIVE_SPLIT_SIZE_PARALLELISM)
335+
.defaultValue(parallelism())
336+
.parse();
337+
Preconditions.checkArgument(parallelism > 0, "Split parallelism must be > 0: %s", parallelism);
338+
return parallelism;
339+
}
340+
328341
public boolean distributedPlanningEnabled() {
329342
return table instanceof SupportsDistributedScanPlanning distributed
330343
&& distributed.allowDistributedPlanning()

spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,4 +108,13 @@ private SparkSQLProperties() {}
108108
public static final String ASYNC_MICRO_BATCH_PLANNING_ENABLED =
109109
"spark.sql.iceberg.async-micro-batch-planning-enabled";
110110
public static final boolean ASYNC_MICRO_BATCH_PLANNING_ENABLED_DEFAULT = false;
111+
112+
// Controls whether adaptive split sizing is enabled
113+
public static final String READ_ADAPTIVE_SPLIT_SIZE_ENABLED =
114+
"spark.sql.iceberg.read.adaptive-split-size.enabled";
115+
116+
// Overrides the parallelism used for adaptive split sizing. When unset, the parallelism
117+
// defaults to max(spark.default.parallelism, spark.sql.shuffle.partitions).
118+
public static final String READ_ADAPTIVE_SPLIT_SIZE_PARALLELISM =
119+
"spark.sql.iceberg.read.adaptive-split-size.parallelism";
111120
}

spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -356,8 +356,17 @@ public CustomMetric[] supportedCustomMetrics() {
356356
protected long adjustSplitSize(List<? extends ScanTask> tasks, long splitSize) {
357357
if (readConf.splitSizeOption() == null && readConf.adaptiveSplitSizeEnabled()) {
358358
long scanSize = tasks.stream().mapToLong(ScanTask::sizeBytes).sum();
359-
int parallelism = readConf.parallelism();
360-
return TableScanUtil.adjustSplitSize(scanSize, parallelism, splitSize);
359+
int parallelism = readConf.splitParallelism();
360+
long adjustedSplitSize = TableScanUtil.adjustSplitSize(scanSize, parallelism, splitSize);
361+
if (adjustedSplitSize != splitSize) {
362+
LOG.debug(
363+
"Adjusted split size from {} to {} for table {} with parallelism {}",
364+
splitSize,
365+
adjustedSplitSize,
366+
table().name(),
367+
parallelism);
368+
}
369+
return adjustedSplitSize;
361370
} else {
362371
return splitSize;
363372
}
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.iceberg.spark;
20+
21+
import static org.assertj.core.api.Assertions.assertThat;
22+
import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException;
23+
24+
import org.apache.iceberg.ParameterizedTestExtension;
25+
import org.apache.iceberg.Table;
26+
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
27+
import org.apache.spark.sql.internal.SQLConf;
28+
import org.apache.spark.sql.util.CaseInsensitiveStringMap;
29+
import org.junit.jupiter.api.AfterEach;
30+
import org.junit.jupiter.api.BeforeEach;
31+
import org.junit.jupiter.api.TestTemplate;
32+
import org.junit.jupiter.api.extension.ExtendWith;
33+
34+
@ExtendWith(ParameterizedTestExtension.class)
35+
public class TestSparkReadConf extends TestBaseWithCatalog {
36+
37+
@BeforeEach
38+
public void before() {
39+
super.before();
40+
sql("CREATE TABLE %s (id BIGINT, data STRING) USING iceberg", tableName);
41+
}
42+
43+
@AfterEach
44+
public void after() {
45+
sql("DROP TABLE IF EXISTS %s", tableName);
46+
}
47+
48+
@TestTemplate
49+
public void testSplitParallelismDefault() {
50+
Table table = validationCatalog.loadTable(tableIdent);
51+
SparkReadConf conf = new SparkReadConf(spark, table, CaseInsensitiveStringMap.empty());
52+
assertThat(conf.splitParallelism()).isEqualTo(conf.parallelism());
53+
}
54+
55+
@TestTemplate
56+
public void testSplitParallelismSessionConf() {
57+
Table table = validationCatalog.loadTable(tableIdent);
58+
withSQLConf(
59+
ImmutableMap.of(
60+
SQLConf.SHUFFLE_PARTITIONS().key(),
61+
"999",
62+
SparkSQLProperties.READ_ADAPTIVE_SPLIT_SIZE_PARALLELISM,
63+
"42"),
64+
() -> {
65+
SparkReadConf conf = new SparkReadConf(spark, table, CaseInsensitiveStringMap.empty());
66+
assertThat(conf.splitParallelism()).isEqualTo(42);
67+
});
68+
}
69+
70+
@TestTemplate
71+
public void testSplitParallelismRejectsZero() {
72+
Table table = validationCatalog.loadTable(tableIdent);
73+
withSQLConf(
74+
ImmutableMap.of(SparkSQLProperties.READ_ADAPTIVE_SPLIT_SIZE_PARALLELISM, "0"),
75+
() -> {
76+
SparkReadConf conf = new SparkReadConf(spark, table, CaseInsensitiveStringMap.empty());
77+
assertThatIllegalArgumentException()
78+
.isThrownBy(conf::splitParallelism)
79+
.withMessageContaining("Split parallelism must be > 0");
80+
});
81+
}
82+
83+
@TestTemplate
84+
public void testSplitParallelismRejectsNegative() {
85+
Table table = validationCatalog.loadTable(tableIdent);
86+
withSQLConf(
87+
ImmutableMap.of(SparkSQLProperties.READ_ADAPTIVE_SPLIT_SIZE_PARALLELISM, "-5"),
88+
() -> {
89+
SparkReadConf conf = new SparkReadConf(spark, table, CaseInsensitiveStringMap.empty());
90+
assertThatIllegalArgumentException()
91+
.isThrownBy(conf::splitParallelism)
92+
.withMessageContaining("Split parallelism must be > 0");
93+
});
94+
}
95+
}

spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.apache.iceberg.TableProperties;
2828
import org.apache.iceberg.exceptions.ValidationException;
2929
import org.apache.iceberg.hadoop.Util;
30+
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
3031
import org.apache.iceberg.util.PropertyUtil;
3132
import org.apache.spark.SparkConf;
3233
import org.apache.spark.sql.SparkSession;
@@ -314,6 +315,7 @@ public boolean aggregatePushDownEnabled() {
314315
public boolean adaptiveSplitSizeEnabled() {
315316
return confParser
316317
.booleanConf()
318+
.sessionConf(SparkSQLProperties.READ_ADAPTIVE_SPLIT_SIZE_ENABLED)
317319
.tableProperty(TableProperties.ADAPTIVE_SPLIT_SIZE_ENABLED)
318320
.defaultValue(TableProperties.ADAPTIVE_SPLIT_SIZE_ENABLED_DEFAULT)
319321
.parse();
@@ -325,6 +327,17 @@ public int parallelism() {
325327
return Math.max(defaultParallelism, numShufflePartitions);
326328
}
327329

330+
public int splitParallelism() {
331+
int parallelism =
332+
confParser
333+
.intConf()
334+
.sessionConf(SparkSQLProperties.READ_ADAPTIVE_SPLIT_SIZE_PARALLELISM)
335+
.defaultValue(parallelism())
336+
.parse();
337+
Preconditions.checkArgument(parallelism > 0, "Split parallelism must be > 0: %s", parallelism);
338+
return parallelism;
339+
}
340+
328341
public boolean distributedPlanningEnabled() {
329342
return table instanceof SupportsDistributedScanPlanning distributed
330343
&& distributed.allowDistributedPlanning()

spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,15 @@ private SparkSQLProperties() {}
112112
// Prefix for custom snapshot properties
113113
public static final String SNAPSHOT_PROPERTY_PREFIX = "spark.sql.iceberg.snapshot-property.";
114114

115+
// Controls whether adaptive split sizing is enabled
116+
public static final String READ_ADAPTIVE_SPLIT_SIZE_ENABLED =
117+
"spark.sql.iceberg.read.adaptive-split-size.enabled";
118+
119+
// Overrides the parallelism used for adaptive split sizing. When unset, the parallelism
120+
// defaults to max(spark.default.parallelism, spark.sql.shuffle.partitions).
121+
public static final String READ_ADAPTIVE_SPLIT_SIZE_PARALLELISM =
122+
"spark.sql.iceberg.read.adaptive-split-size.parallelism";
123+
115124
// Controls whether to shred variant columns during write operations
116125
public static final String SHRED_VARIANTS = "spark.sql.iceberg.shred-variants";
117126

0 commit comments

Comments
 (0)