Skip to content

Commit e763507

Browse files
committed
chore: remove config option for native_iceberg_compat
COMET_NATIVE_SCAN_IMPL config now only allows auto or native_datafusion. CometScanRule always uses native_datafusion. Tests no longer parameterize on scan impl and iceberg_compat-specific tests are removed. Golden files are regenerated in a follow-up commit.
1 parent 53d65cb commit e763507

17 files changed

Lines changed: 166 additions & 383 deletions

File tree

.github/actions/java-test/action.yaml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,6 @@ inputs:
2929
description: 'Maven options passed to the mvn command'
3030
required: false
3131
default: ''
32-
scan_impl:
33-
description: 'The default Parquet scan implementation'
34-
required: false
35-
default: 'auto'
3632
upload-test-reports:
3733
description: 'Whether to upload test results including coverage to GitHub'
3834
required: false
@@ -72,7 +68,6 @@ runs:
7268
shell: bash
7369
if: ${{ inputs.suites == '' }}
7470
env:
75-
COMET_PARQUET_SCAN_IMPL: ${{ inputs.scan_impl }}
7671
SPARK_LOCAL_HOSTNAME: "localhost"
7772
SPARK_LOCAL_IP: "127.0.0.1"
7873
run: |
@@ -81,7 +76,6 @@ runs:
8176
shell: bash
8277
if: ${{ inputs.suites != '' }}
8378
env:
84-
COMET_PARQUET_SCAN_IMPL: ${{ inputs.scan_impl }}
8579
SPARK_LOCAL_HOSTNAME: "localhost"
8680
SPARK_LOCAL_IP: "127.0.0.1"
8781
run: |

.github/workflows/pr_build_linux.yml

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -285,32 +285,26 @@ jobs:
285285
- name: "Spark 3.4, JDK 11, Scala 2.12"
286286
java_version: "11"
287287
maven_opts: "-Pspark-3.4 -Pscala-2.12"
288-
scan_impl: "auto"
289288

290289
- name: "Spark 3.5, JDK 17, Scala 2.13"
291290
java_version: "17"
292291
maven_opts: "-Pspark-3.5 -Pscala-2.13"
293-
scan_impl: "native_iceberg_compat"
294292

295293
- name: "Spark 4.0, JDK 17"
296294
java_version: "17"
297295
maven_opts: "-Pspark-4.0"
298-
scan_impl: "auto"
299296

300297
- name: "Spark 4.0, JDK 21"
301298
java_version: "21"
302299
maven_opts: "-Pspark-4.0"
303-
scan_impl: "auto"
304300

305301
- name: "Spark 4.1, JDK 17"
306302
java_version: "17"
307303
maven_opts: "-Pspark-4.1"
308-
scan_impl: "auto"
309304

310305
- name: "Spark 4.2, JDK 17"
311306
java_version: "17"
312307
maven_opts: "-Pspark-4.2"
313-
scan_impl: "auto"
314308
suite:
315309
- name: "fuzz"
316310
value: |
@@ -399,7 +393,7 @@ jobs:
399393
org.apache.spark.sql.CometToPrettyStringSuite
400394
org.apache.spark.sql.CometCollationSuite
401395
fail-fast: false
402-
name: ${{ matrix.profile.name }}/${{ matrix.profile.scan_impl }} [${{ matrix.suite.name }}]
396+
name: ${{ matrix.profile.name }} [${{ matrix.suite.name }}]
403397
runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion-comet', github.run_id) || 'ubuntu-latest' }}
404398
container:
405399
image: amd64/rust
@@ -437,10 +431,9 @@ jobs:
437431
- name: Java test steps
438432
uses: ./.github/actions/java-test
439433
with:
440-
artifact_name: ${{ matrix.profile.name }}-${{ matrix.suite.name }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}-${{ matrix.profile.scan_impl }}
434+
artifact_name: ${{ matrix.profile.name }}-${{ matrix.suite.name }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}
441435
suites: ${{ matrix.suite.name == 'sql' && matrix.profile.name == 'Spark 3.4, JDK 11, Scala 2.12' && '' || matrix.suite.value }}
442436
maven_opts: ${{ matrix.profile.maven_opts }}
443-
scan_impl: ${{ matrix.profile.scan_impl }}
444437
upload-test-reports: true
445438
skip-native-build: true
446439

.github/workflows/spark_sql_test.yml

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -132,16 +132,14 @@ jobs:
132132
- {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
133133
- {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
134134
- {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
135-
# Since 4f5eaf0, auto mode uses native_datafusion for V1 scans,
136-
# so we only need to test with auto.
137135
config:
138-
- {spark-short: '3.4', spark-full: '3.4.3', java: 11, scan-impl: 'auto'}
139-
- {spark-short: '3.5', spark-full: '3.5.8', java: 11, scan-impl: 'auto'}
140-
- {spark-short: '4.0', spark-full: '4.0.2', java: 17, scan-impl: 'auto'}
141-
- {spark-short: '4.0', spark-full: '4.0.2', java: 21, scan-impl: 'auto'}
142-
- {spark-short: '4.1', spark-full: '4.1.1', java: 17, scan-impl: 'auto'}
136+
- {spark-short: '3.4', spark-full: '3.4.3', java: 11}
137+
- {spark-short: '3.5', spark-full: '3.5.8', java: 11}
138+
- {spark-short: '4.0', spark-full: '4.0.2', java: 17}
139+
- {spark-short: '4.0', spark-full: '4.0.2', java: 21}
140+
- {spark-short: '4.1', spark-full: '4.1.1', java: 17}
143141
fail-fast: false
144-
name: spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}/spark-${{ matrix.config.spark-full }}-jdk${{ matrix.config.java }}
142+
name: spark-sql-${{ matrix.module.name }}/spark-${{ matrix.config.spark-full }}-jdk${{ matrix.config.java }}
145143
# Hive tests stay on the standard GitHub-hosted runner: HiveSparkSubmitSuite
146144
# relies on an Ivy 'local-m2-cache' resolver that the runs-on.com
147145
# ubuntu24-full-x64 image does not provide, so spark-submit fails there.
@@ -171,7 +169,7 @@ jobs:
171169
run: |
172170
cd apache-spark
173171
rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
174-
NOLINT_ON_COMPILE=true ENABLE_COMET=true ENABLE_COMET_ONHEAP=true COMET_PARQUET_SCAN_IMPL=${{ matrix.config.scan-impl }} ENABLE_COMET_LOG_FALLBACK_REASONS=${{ github.event.inputs.collect-fallback-logs || 'false' }} \
172+
NOLINT_ON_COMPILE=true ENABLE_COMET=true ENABLE_COMET_ONHEAP=true ENABLE_COMET_LOG_FALLBACK_REASONS=${{ github.event.inputs.collect-fallback-logs || 'false' }} \
175173
build/sbt -Dsbt.log.noformat=true -mem 6144 ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
176174
if [ "${{ github.event.inputs.collect-fallback-logs }}" = "true" ]; then
177175
find . -type f -name "unit-tests.log" -print0 | xargs -0 grep -h "Comet cannot accelerate" | sed 's/.*Comet cannot accelerate/Comet cannot accelerate/' | sort -u > fallback.log
@@ -190,7 +188,7 @@ jobs:
190188
if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
191189
uses: actions/upload-artifact@v7
192190
with:
193-
name: fallback-log-spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}-spark-${{ matrix.config.spark-full }}-jdk${{ matrix.config.java }}
191+
name: fallback-log-spark-sql-${{ matrix.module.name }}-spark-${{ matrix.config.spark-full }}-jdk${{ matrix.config.java }}
194192
path: "**/fallback.log"
195193

196194
merge-fallback-logs:

.github/workflows/spark_sql_test_native_iceberg_compat.yml

Lines changed: 0 additions & 72 deletions
This file was deleted.

common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -111,22 +111,23 @@ object CometConf extends ShimCometConf {
111111
.booleanConf
112112
.createWithEnvVarOrDefault("ENABLE_COMET_WRITE", false)
113113

114+
@deprecated
114115
val SCAN_NATIVE_DATAFUSION = "native_datafusion"
116+
117+
@deprecated
115118
val SCAN_NATIVE_ICEBERG_COMPAT = "native_iceberg_compat"
119+
120+
@deprecated
116121
val SCAN_AUTO = "auto"
117122

123+
@deprecated
118124
val COMET_NATIVE_SCAN_IMPL: ConfigEntry[String] = conf("spark.comet.scan.impl")
119-
.category(CATEGORY_PARQUET)
120-
.doc(
121-
"The implementation of Comet's Parquet scan to use. Available scans are " +
122-
s"`$SCAN_NATIVE_DATAFUSION`, and `$SCAN_NATIVE_ICEBERG_COMPAT`. " +
123-
s"`$SCAN_NATIVE_DATAFUSION` is a fully native implementation, and " +
124-
s"`$SCAN_NATIVE_ICEBERG_COMPAT` is a hybrid implementation that supports some " +
125-
"additional features, such as row indexes and field ids. " +
126-
s"`$SCAN_AUTO` (default) chooses the best available scan based on the scan schema.")
125+
.category(CATEGORY_TESTING)
126+
.internal()
127+
.doc("This configuration option is deprecated and has no effect on Comet behavior.")
127128
.stringConf
128129
.transform(_.toLowerCase(Locale.ROOT))
129-
.checkValues(Set(SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT, SCAN_AUTO))
130+
.checkValues(Set(SCAN_NATIVE_DATAFUSION, SCAN_AUTO))
130131
.createWithEnvVarOrDefault("COMET_PARQUET_SCAN_IMPL", SCAN_AUTO)
131132

132133
val COMET_ICEBERG_NATIVE_ENABLED: ConfigEntry[Boolean] =

docs/source/contributor-guide/bug_triage.md

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,7 @@ help contributors find bugs in their area of expertise.
7373
| `area:ffi` | Arrow FFI / JNI boundary |
7474
| `area:ci` | CI/CD, GitHub Actions, build tooling |
7575

76-
The following pre-existing labels also serve as area indicators: `native_datafusion`,
77-
`native_iceberg_compat`, `spark 4`, `spark sql tests`.
76+
The following pre-existing labels also serve as area indicators: `spark 4`, `spark sql tests`.
7877

7978
## Triage Process
8079

@@ -109,9 +108,8 @@ Periodically review open bugs to ensure priorities are still accurate:
109108
crashes, because crashes are at least visible.
110109
2. **User-reported over test-only.** A bug hit by a real user on a real workload takes priority
111110
over one found only in test suites.
112-
3. **Core path over experimental.** Bugs in the default scan mode (`native_comet`) or widely-used
113-
expressions take priority over bugs in experimental features like `native_datafusion` or
114-
`native_iceberg_compat`.
111+
3. **Core path over experimental.** Bugs in widely-used expressions and operators take priority over
112+
bugs in experimental features.
115113
4. **Production safety over feature completeness.** Fixing a data corruption bug is more important
116114
than adding support for a new expression.
117115

docs/source/user-guide/latest/datasources.md

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,9 @@ Comet supports most standard storage systems, such as local file system and obje
6161

6262
Apache DataFusion Comet native reader seamlessly scans files from remote HDFS for [supported formats](#supported-spark-data-sources)
6363

64-
### Using experimental native DataFusion reader
64+
### Building Comet with HDFS support
6565

66-
Unlike to native Comet reader the Datafusion reader fully supports nested types processing. This reader is currently experimental only
67-
68-
To build Comet with native DataFusion reader and remote HDFS support it is required to have a JDK installed
66+
To build Comet with remote HDFS support it is required to have a JDK installed.
6967

7068
Example:
7169
Build a Comet for `spark-4.1` provide a JDK path in `JAVA_HOME`
@@ -76,11 +74,10 @@ export JAVA_HOME="/opt/homebrew/opt/openjdk@17"
7674
make release PROFILES="-Pspark-4.1" COMET_FEATURES=hdfs RUSTFLAGS="-L $JAVA_HOME/libexec/openjdk.jdk/Contents/Home/lib/server"
7775
```
7876

79-
Start Comet with experimental reader and HDFS support as [described](installation.md/#run-spark-shell-with-comet-enabled)
77+
Start Comet with HDFS support as [described](installation.md/#run-spark-shell-with-comet-enabled)
8078
and add additional parameters
8179

8280
```shell
83-
--conf spark.comet.scan.impl=native_datafusion \
8481
--conf spark.hadoop.fs.defaultFS="hdfs://namenode:9000" \
8582
--conf spark.hadoop.dfs.client.use.datanode.hostname = true \
8683
--conf dfs.client.use.datanode.hostname = true
@@ -158,7 +155,6 @@ JAVA_HOME="/opt/homebrew/opt/openjdk@17" make release PROFILES="-Pspark-4.1" COM
158155
withSQLConf(
159156
CometConf.COMET_ENABLED.key -> "true",
160157
CometConf.COMET_EXEC_ENABLED.key -> "true",
161-
CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_DATAFUSION,
162158
SQLConf.USE_V1_SOURCE_LIST.key -> "parquet",
163159
"fs.defaultFS" -> "hdfs://namenode:9000",
164160
"dfs.client.use.datanode.hostname" -> "true") {
@@ -169,7 +165,7 @@ JAVA_HOME="/opt/homebrew/opt/openjdk@17" make release PROFILES="-Pspark-4.1" COM
169165
}
170166
```
171167

172-
Or use `spark-shell` with HDFS support as described [above](#using-experimental-native-datafusion-reader)
168+
Or use `spark-shell` with HDFS support as described [above](#building-comet-with-hdfs-support)
173169

174170
## S3
175171

spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -183,12 +183,7 @@ case class CometScanRule(session: SparkSession)
183183
return scanExec
184184
}
185185

186-
COMET_NATIVE_SCAN_IMPL.get() match {
187-
case SCAN_AUTO | SCAN_NATIVE_DATAFUSION =>
188-
nativeDataFusionScan(plan, session, scanExec, r, hadoopConf).getOrElse(scanExec)
189-
case SCAN_NATIVE_ICEBERG_COMPAT =>
190-
nativeIcebergCompatScan(session, scanExec, r, hadoopConf).getOrElse(scanExec)
191-
}
186+
nativeDataFusionScan(plan, session, scanExec, r, hadoopConf).getOrElse(scanExec)
192187

193188
case _ =>
194189
withInfo(scanExec, s"Unsupported relation ${scanExec.relation}")
@@ -242,21 +237,6 @@ case class CometScanRule(session: SparkSession)
242237
Some(CometScanExec(scanExec, session, SCAN_NATIVE_DATAFUSION))
243238
}
244239

245-
private def nativeIcebergCompatScan(
246-
session: SparkSession,
247-
scanExec: FileSourceScanExec,
248-
r: HadoopFsRelation,
249-
hadoopConf: Configuration): Option[SparkPlan] = {
250-
if (encryptionEnabled(hadoopConf) && !isEncryptionConfigSupported(hadoopConf)) {
251-
withInfo(scanExec, s"$SCAN_NATIVE_ICEBERG_COMPAT does not support encryption")
252-
return None
253-
}
254-
if (!isSchemaSupported(scanExec, SCAN_NATIVE_ICEBERG_COMPAT, r)) {
255-
return None
256-
}
257-
Some(CometScanExec(scanExec, session, SCAN_NATIVE_ICEBERG_COMPAT))
258-
}
259-
260240
private def transformV2Scan(scanExec: BatchScanExec): SparkPlan = {
261241

262242
scanExec.scan match {
@@ -799,8 +779,7 @@ object CometScanRule extends Logging {
799779
Native.validateObjectStoreConfig(filePath, objectStoreOptions)
800780
} catch {
801781
case e: CometNativeException =>
802-
val reason = "Object store config not supported by " +
803-
s"$SCAN_NATIVE_ICEBERG_COMPAT: ${e.getMessage}"
782+
val reason = s"Object store config not supported: ${e.getMessage}"
804783
fallbackReasons += reason
805784
configValidityMap.put(cacheKey, Some(reason))
806785
}

spark/src/test/scala/org/apache/comet/CometArrayExpressionSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -985,7 +985,7 @@ class CometArrayExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelp
985985

986986
test("size - respect to legacySizeOfNull") {
987987
val table = "t1"
988-
withSQLConf(CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_ICEBERG_COMPAT) {
988+
withSQLConf(CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_DATAFUSION) {
989989
withTable(table) {
990990
sql(s"create table $table(col array<string>) using parquet")
991991
sql(s"insert into $table values(null)")

spark/src/test/scala/org/apache/comet/CometCsvExpressionSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ class CometCsvExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper
7171
test("to_csv - with configurable formatting options") {
7272
val table = "t1"
7373
withSQLConf(
74-
CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_ICEBERG_COMPAT,
74+
CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_DATAFUSION,
7575
CometConf.getExprAllowIncompatConfigKey(classOf[StructsToCsv]) -> "true") {
7676
withTable(table) {
7777
val newLinesStr =

0 commit comments

Comments
 (0)