Skip to content

Commit 7930428

Browse files
authored
Merge branch 'main' into fix_comet_sum_compatbility_level
2 parents 64d97f7 + 8f3cee5 commit 7930428

419 files changed

Lines changed: 6442 additions & 28576 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.asf.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@ github:
4141
features:
4242
issues: true
4343
discussions: true
44+
projects: true
4445
protected_branches:
4546
main:
4647
required_pull_request_reviews:

.github/workflows/docker-publish.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,6 @@ jobs:
7474
with:
7575
platforms: linux/amd64,linux/arm64
7676
push: true
77-
tags: ghcr.io/apache/datafusion-comet:spark-3.5-scala-2.12-${{ env.COMET_VERSION }}
77+
tags: ghcr.io/apache/datafusion-comet:spark-4.1-scala-2.13-${{ env.COMET_VERSION }}
7878
file: kube/Dockerfile
7979
no-cache: true

.github/workflows/iceberg_spark_test.yml

Lines changed: 15 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -120,10 +120,14 @@ jobs:
120120
strategy:
121121
matrix:
122122
os: [ubuntu-24.04]
123-
java-version: [11, 17]
124123
iceberg-version: [{short: '1.8', full: '1.8.1'}, {short: '1.9', full: '1.9.1'}, {short: '1.10', full: '1.10.0'}]
125124
spark-version: [{short: '3.4', full: '3.4.3'}, {short: '3.5', full: '3.5.8'}]
126125
scala-version: ['2.13']
126+
include:
127+
- spark-version: {short: '3.4', full: '3.4.3'}
128+
java-version: 11
129+
- spark-version: {short: '3.5', full: '3.5.8'}
130+
java-version: 17
127131
fail-fast: false
128132
name: iceberg-spark/${{ matrix.os }}/iceberg-${{ matrix.iceberg-version.full }}/spark-${{ matrix.spark-version.full }}/scala-${{ matrix.scala-version }}/java-${{ matrix.java-version }}
129133
runs-on: ${{ matrix.os }}
@@ -163,10 +167,14 @@ jobs:
163167
strategy:
164168
matrix:
165169
os: [ubuntu-24.04]
166-
java-version: [11, 17]
167170
iceberg-version: [{short: '1.8', full: '1.8.1'}, {short: '1.9', full: '1.9.1'}, {short: '1.10', full: '1.10.0'}]
168171
spark-version: [{short: '3.4', full: '3.4.3'}, {short: '3.5', full: '3.5.8'}]
169172
scala-version: ['2.13']
173+
include:
174+
- spark-version: {short: '3.4', full: '3.4.3'}
175+
java-version: 11
176+
- spark-version: {short: '3.5', full: '3.5.8'}
177+
java-version: 17
170178
fail-fast: false
171179
name: iceberg-spark-extensions/${{ matrix.os }}/iceberg-${{ matrix.iceberg-version.full }}/spark-${{ matrix.spark-version.full }}/scala-${{ matrix.scala-version }}/java-${{ matrix.java-version }}
172180
runs-on: ${{ matrix.os }}
@@ -206,10 +214,14 @@ jobs:
206214
strategy:
207215
matrix:
208216
os: [ubuntu-24.04]
209-
java-version: [11, 17]
210217
iceberg-version: [{short: '1.8', full: '1.8.1'}, {short: '1.9', full: '1.9.1'}, {short: '1.10', full: '1.10.0'}]
211218
spark-version: [{short: '3.4', full: '3.4.3'}, {short: '3.5', full: '3.5.8'}]
212219
scala-version: ['2.13']
220+
include:
221+
- spark-version: {short: '3.4', full: '3.4.3'}
222+
java-version: 11
223+
- spark-version: {short: '3.5', full: '3.5.8'}
224+
java-version: 17
213225
fail-fast: false
214226
name: iceberg-spark-runtime/${{ matrix.os }}/iceberg-${{ matrix.iceberg-version.full }}/spark-${{ matrix.spark-version.full }}/scala-${{ matrix.scala-version }}/java-${{ matrix.java-version }}
215227
runs-on: ${{ matrix.os }}

.github/workflows/pr_build_linux.yml

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -355,6 +355,7 @@ jobs:
355355
org.apache.comet.exec.CometWindowExecSuite
356356
org.apache.comet.exec.CometJoinSuite
357357
org.apache.comet.CometNativeSuite
358+
org.apache.comet.CometSetOpWithGroupBySuite
358359
org.apache.comet.CometSparkSessionExtensionsSuite
359360
org.apache.spark.CometPluginsSuite
360361
org.apache.spark.CometPluginsDefaultSuite
@@ -370,6 +371,7 @@ jobs:
370371
org.apache.spark.sql.comet.CometTaskMetricsSuite
371372
org.apache.spark.sql.comet.CometDppFallbackRepro3949Suite
372373
org.apache.spark.sql.comet.CometShuffleFallbackStickinessSuite
374+
org.apache.spark.sql.comet.CometDecimalArithmeticViewSuite
373375
org.apache.comet.objectstore.NativeConfigSuite
374376
- name: "expressions"
375377
value: |
@@ -449,6 +451,8 @@ jobs:
449451
runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion-comet', github.run_id) || 'ubuntu-latest' }}
450452
container:
451453
image: amd64/rust
454+
env:
455+
JAVA_TOOL_OPTIONS: --add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-exports=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED
452456
steps:
453457
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
454458

@@ -458,7 +462,7 @@ jobs:
458462
uses: ./.github/actions/setup-builder
459463
with:
460464
rust-version: ${{ env.RUST_VERSION }}
461-
jdk-version: 11
465+
jdk-version: 17
462466

463467
- name: Download native library
464468
uses: actions/download-artifact@v8
@@ -503,6 +507,8 @@ jobs:
503507
runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion-comet', github.run_id) || 'ubuntu-latest' }}
504508
container:
505509
image: amd64/rust
510+
env:
511+
JAVA_TOOL_OPTIONS: --add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-exports=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED
506512
strategy:
507513
matrix:
508514
join: [sort_merge, broadcast, hash]
@@ -516,7 +522,7 @@ jobs:
516522
uses: ./.github/actions/setup-builder
517523
with:
518524
rust-version: ${{ env.RUST_VERSION }}
519-
jdk-version: 11
525+
jdk-version: 17
520526

521527
- name: Download native library
522528
uses: actions/download-artifact@v8

.github/workflows/pr_build_macos.yml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -194,6 +194,7 @@ jobs:
194194
org.apache.comet.exec.CometWindowExecSuite
195195
org.apache.comet.exec.CometJoinSuite
196196
org.apache.comet.CometNativeSuite
197+
org.apache.comet.CometSetOpWithGroupBySuite
197198
org.apache.comet.CometSparkSessionExtensionsSuite
198199
org.apache.spark.CometPluginsSuite
199200
org.apache.spark.CometPluginsDefaultSuite
@@ -209,6 +210,7 @@ jobs:
209210
org.apache.spark.sql.comet.CometTaskMetricsSuite
210211
org.apache.spark.sql.comet.CometDppFallbackRepro3949Suite
211212
org.apache.spark.sql.comet.CometShuffleFallbackStickinessSuite
213+
org.apache.spark.sql.comet.CometDecimalArithmeticViewSuite
212214
org.apache.comet.objectstore.NativeConfigSuite
213215
- name: "expressions"
214216
value: |

.github/workflows/spark_sql_test.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -143,7 +143,7 @@ jobs:
143143
- {spark-short: '4.0', spark-full: '4.0.2', java: 21, scan-impl: 'auto'}
144144
- {spark-short: '4.1', spark-full: '4.1.1', java: 17, scan-impl: 'auto'}
145145
fail-fast: false
146-
name: spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}/spark-${{ matrix.config.spark-full }}
146+
name: spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}/spark-${{ matrix.config.spark-full }}-jdk${{ matrix.config.java }}
147147
# Hive tests stay on the standard GitHub-hosted runner: HiveSparkSubmitSuite
148148
# relies on an Ivy 'local-m2-cache' resolver that the runs-on.com
149149
# ubuntu24-full-x64 image does not provide, so spark-submit fails there.
@@ -192,7 +192,7 @@ jobs:
192192
if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
193193
uses: actions/upload-artifact@v7
194194
with:
195-
name: fallback-log-spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}-spark-${{ matrix.config.spark-full }}
195+
name: fallback-log-spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}-spark-${{ matrix.config.spark-full }}-jdk${{ matrix.config.java }}
196196
path: "**/fallback.log"
197197

198198
merge-fallback-logs:

common/src/main/java/org/apache/comet/parquet/Native.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -226,6 +226,7 @@ public static native long initRecordBatchReader(
226226
String sessionTimezone,
227227
int batchSize,
228228
boolean caseSensitive,
229+
boolean returnNullStructIfAllFieldsMissing,
229230
Map<String, String> objectStoreOptions,
230231
CometFileKeyUnwrapper keyUnwrapper,
231232
Object metricsNode);

common/src/main/java/org/apache/comet/parquet/NativeBatchReader.java

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -159,6 +159,11 @@ public URI pathUri() throws URISyntaxException {
159159
protected boolean isCaseSensitive;
160160
protected boolean useFieldId;
161161
protected boolean ignoreMissingIds;
162+
// SPARK-53535 (Spark 4.1+): when reading a struct whose requested fields are all
163+
// missing in the Parquet file, true returns the entire struct as null (legacy
164+
// pre-4.1 behavior); false preserves the parent struct's nullness from the file
165+
// so non-null parents materialize as a struct of all-null fields.
166+
protected boolean returnNullStructIfAllFieldsMissing = true;
162167
protected StructType partitionSchema;
163168
protected InternalRow partitionValues;
164169
protected PartitionedFile file;
@@ -278,6 +283,7 @@ private NativeBatchReader(AbstractColumnReader[] columnReaders) {
278283
boolean useFieldId,
279284
boolean ignoreMissingIds,
280285
boolean useLegacyDateTimestamp,
286+
boolean returnNullStructIfAllFieldsMissing,
281287
StructType partitionSchema,
282288
InternalRow partitionValues,
283289
Map<String, SQLMetric> metrics,
@@ -290,6 +296,7 @@ private NativeBatchReader(AbstractColumnReader[] columnReaders) {
290296
this.useFieldId = useFieldId;
291297
this.ignoreMissingIds = ignoreMissingIds;
292298
this.useLegacyDateTimestamp = useLegacyDateTimestamp;
299+
this.returnNullStructIfAllFieldsMissing = returnNullStructIfAllFieldsMissing;
293300
this.partitionSchema = partitionSchema;
294301
this.partitionValues = partitionValues;
295302
this.file = inputSplit;
@@ -578,6 +585,7 @@ public void init() throws Throwable {
578585
timeZoneId,
579586
batchSize,
580587
caseSensitive,
588+
returnNullStructIfAllFieldsMissing,
581589
objectStoreOptions,
582590
keyUnwrapper,
583591
metricsNode);

common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -94,12 +94,9 @@ object CometConf extends ShimCometConf {
9494
.createWithEnvVarOrDefault("ENABLE_COMET", true)
9595

9696
val COMET_NATIVE_SCAN_ENABLED: ConfigEntry[Boolean] = conf("spark.comet.scan.enabled")
97-
.category(CATEGORY_SCAN)
98-
.doc(
99-
"Whether to enable native scans. When this is turned on, Spark will use Comet to " +
100-
"read supported data sources (currently only Parquet is supported natively). Note " +
101-
"that to enable native vectorized execution, both this config and " +
102-
"`spark.comet.exec.enabled` need to be enabled.")
97+
.category(CATEGORY_TESTING)
98+
.doc("Whether to enable native scans. Intended for use in Comet's own test suites to " +
99+
"selectively disable native scans; not intended for production use.")
103100
.booleanConf
104101
.createWithDefault(true)
105102

common/src/main/scala/org/apache/comet/parquet/CometParquetUtils.scala

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,12 @@ object CometParquetUtils {
2929
private val PARQUET_FIELD_ID_READ_ENABLED = "spark.sql.parquet.fieldId.read.enabled"
3030
private val IGNORE_MISSING_PARQUET_FIELD_ID = "spark.sql.parquet.fieldId.read.ignoreMissing"
3131

32+
// Field-metadata key arrow-rs writes when it lifts Parquet field IDs into the Arrow schema
33+
// (`parquet::arrow::PARQUET_FIELD_ID_META_KEY`). Spark's local key for the same concept is
34+
// `parquet.field.id` (`ParquetUtils.FIELD_ID_METADATA_KEY`). The serde translates at the proto
35+
// boundary so the native side can match the same key it gets from arrow-rs.
36+
val PARQUET_FIELD_ID_META_KEY = "PARQUET:field_id"
37+
3238
// Map of encryption configuration key-value pairs that, if present, are only supported with
3339
// these specific values. Generally, these are the default values that won't be present,
3440
// but if they are present we want to check them.

0 commit comments

Comments
 (0)