Skip to content

Commit 948f3b9

Browse files
committed
better input fuzz coverage
1 parent 965c2ba commit 948f3b9

1 file changed

Lines changed: 68 additions & 0 deletions

File tree

spark/src/test/scala/org/apache/comet/CometCodegenDispatchFuzzSuite.scala

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,74 @@ class CometCodegenDispatchFuzzSuite extends CometTestBase with AdaptiveSparkPlan
220220
}
221221
}
222222

223+
/**
 * Fuzzes element-level reads of nested arrays.
 *
 * The cardinality probe deliberately avoids materializing elements, so it gives no coverage of
 * the kernel's nested element getters (`getInt` / `getLong` / `getDecimal` / etc.) that the
 * unsafe-access optimization would touch. This test fills that gap: for each
 * `Array<primitive>` column of the random schema it evaluates `id_X(array_max(col))`, which
 * makes Spark's `ArrayMax.doGenCode` visit every element of every row. Because `array_max` only
 * compares values, the single expression shape covers all 14 primitive element types Spark
 * supports.
 */
test("array_max element fuzz: every Array<primitive> column") {
  val source = spark.read.parquet(mixedTypesFilename)
  source.createOrReplaceTempView("t1")

  // Keep each qualifying column's name together with its element type so the loop below does
  // not have to destructure the field's data type a second time.
  val primitiveArrays = source.schema.fields.collect {
    case StructField(column, ArrayType(elemType, _), _, _) if !isComplexType(elemType) =>
      column -> elemType
  }
  assert(
    primitiveArrays.nonEmpty,
    "expected at least one Array<primitive> column in random schema")

  primitiveArrays.foreach { case (column, elemType) =>
    val udfName = s"id_arrmax_$column"
    // A missing identity UDF is a gap in the catalog, not a reason to skip the column.
    if (registerIdentityUdfFor(elemType, udfName).isEmpty) {
      fail(
        s"array column $column elem $elemType not in identity UDF catalog; " +
          "extend registerIdentityUdfFor")
    }
    assertCodegenRan {
      checkSparkAnswerAndOperator(s"SELECT $udfName(array_max($column)) FROM t1")
    }
  }
}
257+
258+
/**
 * Element-level fuzz for map key and value reads. `map_keys(col)` / `map_values(col)` produce
 * arrays the kernel walks via Spark's `ArrayMax`, exercising the map's child key/value getter.
 * The leaf primitive read is structurally the same as in the array element fuzz, but the parent
 * offset chain (MapVector -> entries StructVector -> child) differs, so a buggy unsafe getter
 * that mishandled the map's per-row offset would slip past the array test alone. Filters to
 * top-level `Map<primitive, primitive>` columns from the random nested schema.
 *
 * Like the array element fuzz, this test refuses to pass vacuously: it fails when the schema
 * has no qualifying map column, and when a key or value type has no identity UDF registered.
 */
test("array_max element fuzz: map_keys / map_values on Map<primitive, primitive> columns") {
  val df = spark.read.parquet(nestedTypesFilename)
  df.createOrReplaceTempView("t2")

  // Capture name, key type and value type together for every top-level map whose key and value
  // are both non-complex.
  val mapPrimitiveFields = df.schema.fields.collect {
    case StructField(name, MapType(kDt, vDt, _), _, _)
        if !isComplexType(kDt) && !isComplexType(vDt) =>
      (name, kDt, vDt)
  }
  // Without this guard an absent map column would turn the whole test into a silent no-op.
  assert(
    mapPrimitiveFields.nonEmpty,
    "expected at least one Map<primitive, primitive> column in random nested schema")

  mapPrimitiveFields.foreach { case (name, kDt, vDt) =>
    // The key side and value side run the same probe; only the extractor function, the UDF
    // name prefix and the element type differ.
    val probes = Seq(
      ("key", "map_keys", s"id_mapk_$name", kDt),
      ("value", "map_values", s"id_mapv_$name", vDt))

    probes.foreach { case (side, extractor, udfName, dt) =>
      registerIdentityUdfFor(dt, udfName) match {
        case Some(udf) =>
          assertCodegenRan {
            checkSparkAnswerAndOperator(s"SELECT $udf(array_max($extractor($name))) FROM t2")
          }
        case None =>
          // Surface catalog gaps instead of skipping the probe.
          fail(
            s"map column $name $side type $dt not in identity UDF catalog; " +
              "extend registerIdentityUdfFor")
      }
    }
  }
}
290+
223291
/**
224292
* Probes one complex top-level column. ArrayType / MapType go through `cardinality(col)` fed to
225293
* the identity-Int probe UDF (see [[cardinalityProbeUdf]] for the rationale). StructType drills

0 commit comments

Comments
 (0)