@@ -192,7 +192,6 @@ class CometTaskMetricsSuite extends CometTestBase with AdaptiveSparkPlanHelper {
192192 }
193193
194194 test(" native_datafusion scan reports task-level input metrics matching Spark" ) {
195- assume(! isSpark41Plus, " https://github.com/apache/datafusion-comet/issues/4098" )
196195 val totalRows = 10000
197196 withTempPath { dir =>
198197 spark
@@ -230,15 +229,11 @@ class CometTaskMetricsSuite extends CometTestBase with AdaptiveSparkPlanHelper {
230229 // (e.g. footer reads, page headers, buffering granularity).
231230 assert(sparkBytes > 0 , s " Spark bytesRead should be > 0, got $sparkBytes" )
232231 assert(cometBytes > 0 , s " Comet bytesRead should be > 0, got $cometBytes" )
233- val ratio = cometBytes.toDouble / sparkBytes.toDouble
234- assert(
235- ratio >= 0.7 && ratio <= 1.3 ,
236- s " bytesRead ratio out of range: comet= $cometBytes, spark= $sparkBytes, ratio= $ratio" )
232+ assertCometBytesReadInRange(cometBytes, sparkBytes)
237233 }
238234 }
239235
240236 test(" input metrics aggregate across multiple native scans in a join" ) {
241- assume(! isSpark41Plus, " https://github.com/apache/datafusion-comet/issues/4098" )
242237 withTempPath { dir1 =>
243238 withTempPath { dir2 =>
244239 // Create two separate parquet tables
@@ -283,16 +278,34 @@ class CometTaskMetricsSuite extends CometTestBase with AdaptiveSparkPlanHelper {
283278 assert(cometRecords > 0 , s " Comet recordsRead should be > 0, got $cometRecords" )
284279
285280 // Both sides should contribute to the total bytes
286- val ratio = cometBytes.toDouble / sparkBytes.toDouble
287- assert(
288- ratio >= 0.7 && ratio <= 1.3 ,
289- s " bytesRead ratio out of range: comet= $cometBytes, spark= $sparkBytes, ratio= $ratio" )
281+ assertCometBytesReadInRange(cometBytes, sparkBytes)
290282 }
291283 }
292284 }
293285
286+ /**
287+ * Compare Comet's `bytesRead` against Spark's baseline. On Spark <= 4.0 the two readers report
288+ * the same Hadoop-FS thread-local byte count, so we keep a tight 0.7-1.3 band. Spark 4.1
289+ * pre-opens the parquet `SeekableInputStream` outside the FileScanRDD `compute()` thread, so
290+ * its `getFSBytesReadOnThreadCallback` no longer captures most of the parquet IO and
291+ * `inputMetrics.bytesRead` is now a small fraction of the actual file IO. Comet (via
292+ * DataFusion's `bytes_scanned`) still reports actual bytes, so the only safe cross-version
293+ * invariant on 4.1+ is that Comet >= Spark and both are positive.
294+ */
295+ private def assertCometBytesReadInRange (cometBytes : Long , sparkBytes : Long ): Unit = {
296+ if (isSpark41Plus) {
297+ assert(
298+ cometBytes >= sparkBytes,
299+ s " Comet bytesRead should be >= Spark bytesRead on 4.1+: comet= $cometBytes, spark= $sparkBytes" )
300+ } else {
301+ val ratio = cometBytes.toDouble / sparkBytes.toDouble
302+ assert(
303+ ratio >= 0.7 && ratio <= 1.3 ,
304+ s " bytesRead ratio out of range: comet= $cometBytes, spark= $sparkBytes, ratio= $ratio" )
305+ }
306+ }
307+
294308 test(" input metrics aggregate across multiple native scans in a union" ) {
295- assume(! isSpark41Plus, " https://github.com/apache/datafusion-comet/issues/4098" )
296309 withTempPath { dir1 =>
297310 withTempPath { dir2 =>
298311 spark
@@ -335,10 +348,7 @@ class CometTaskMetricsSuite extends CometTestBase with AdaptiveSparkPlanHelper {
335348 assert(sparkRecords > 0 , s " Spark recordsRead should be > 0, got $sparkRecords" )
336349 assert(cometRecords > 0 , s " Comet recordsRead should be > 0, got $cometRecords" )
337350
338- val ratio = cometBytes.toDouble / sparkBytes.toDouble
339- assert(
340- ratio >= 0.7 && ratio <= 1.3 ,
341- s " bytesRead ratio out of range: comet= $cometBytes, spark= $sparkBytes, ratio= $ratio" )
351+ assertCometBytesReadInRange(cometBytes, sparkBytes)
342352 }
343353 }
344354 }
0 commit comments