Skip to content

Commit e2d98c0

Browse files
authored
fall back to jobGroup.id if jobRunId equals taskRunId (#11199)
# What Does This Do

On Databricks 18.2, `spark.databricks.job.runId` returns the job run ID instead of the task run ID. This causes `DatabricksParentContext` to compute identical trace and span IDs, orphaning Spark spans from their Databricks parent. Fix: skip `spark.databricks.job.runId` when its value equals the job run ID and fall through to the `spark.jobGroup.id` extraction.

# Motivation

Broken parent-child relationship between Databricks job/task spans and Spark spans on Databricks 18.2. The `spark.databricks.job.runId` property semantics changed in Databricks 18.2 — it now returns the job-level run ID rather than the task-level run ID. Prior versions are unaffected.

# QA Examples

Trace before: https://ddstaging.datadoghq.com/apm/traces?query=%40_trace_root%3A1%20service%3Aall-purpose&agg_m=count&agg_m_source=base&agg_t=count&cols=service%2Cresource_name%2C%40duration%2C%40http.method%2C%40http.status_code%2C%40_span.count%2C%40_duration.by_service&fromUser=false&graphType=flamegraph&historicalData=true&messageDisplay=inline&query_translation_version=v0&shouldShowLegend=true&sort=desc&spanID=2159132286262269065&spanType=trace-root&storage=hot&timeHint=1777044603326&trace=AwAAAZ3AHMG-ib6LIAAAABhBWjNBSGtRNUFBQ19GLUxjWVlIV2t2WWQAAAAkMTE5ZGMwMzgtMmFmOS00NTFlLTk2YWYtZWFjNTQxOWY3YjhhAAAAfg&traceID=2159132286262269063&traceQuery=&view=spans&start=1776931657013&end=1777277257013&paused=false

Trace after:
https://ddstaging.datadoghq.com/apm/traces?query=%40_trace_root%3A1%20service%3Aall-purpose&agg_m=count&agg_m_source=base&agg_t=count&cols=service%2Cresource_name%2C%40duration%2C%40http.method%2C%40http.status_code%2C%40_span.count%2C%40_duration.by_service&fromUser=false&graphType=flamegraph&historicalData=true&messageDisplay=inline&query_translation_version=v0&shouldShowLegend=true&sort=desc&spanID=10469624543607262827&spanType=trace-root&storage=hot&timeHint=1777276836597&trace=AwAAAZ3N9Fr1O6CLVgAAABhBWjNOOXhfLUFBQ3NPZURNalpLQkZrdVIAAAAkZjE5ZGNkZjctMmI4ZC00YWJkLWJlZmYtYThlNGJjMmE3Zjk2AAAAAQ&traceID=10469624543607262825&traceQuery=&view=spans&start=1777190817655&end=1777277217655&paused=false

# Contributor Checklist

- Format the title according to [the contribution guidelines](https://github.com/DataDog/dd-trace-java/blob/master/CONTRIBUTING.md#title-format)
- Assign the `type:` and (`comp:` or `inst:`) labels in addition to [any other useful labels](https://github.com/DataDog/dd-trace-java/blob/master/CONTRIBUTING.md#labels)
- Avoid using `close`, `fix`, or [any linking keywords](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword) when referencing an issue. Use `solves` instead, and assign the PR [milestone](https://github.com/DataDog/dd-trace-java/milestones) to the issue
- Update the [CODEOWNERS](https://github.com/DataDog/dd-trace-java/blob/master/.github/CODEOWNERS) file on source file addition, migration, or deletion
- Update [public documentation](https://docs.datadoghq.com/tracing/trace_collection/library_config/java/) with any new configuration flags or behaviors

<!-- # Opening vs Drafting a PR: When opening a pull request, please open it as a draft to not auto assign reviewers before you feel the pull request is in a reviewable state.
# Linking a JIRA ticket: Please link your JIRA ticket by adding its identifier between brackets (ex [PROJ-IDENT]) in the PR description, not the title. This requirement only applies to Datadog employees. --> Co-authored-by: adrien.boitreaud <adrien.boitreaud@datadoghq.com>
1 parent 515d17b commit e2d98c0

3 files changed

Lines changed: 54 additions & 6 deletions

File tree

dd-java-agent/instrumentation/spark/spark-common/src/main/java/datadog/trace/instrumentation/spark/AbstractDatadogSparkListener.java

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,7 @@ private void addDatabricksSpecificTags(
426426
if (properties != null) {
427427
String databricksJobId = getDatabricksJobId(properties);
428428
String databricksJobRunId = getDatabricksJobRunId(properties, databricksClusterName);
429-
String databricksTaskRunId = getDatabricksTaskRunId(properties);
429+
String databricksTaskRunId = getDatabricksTaskRunId(properties, databricksJobRunId);
430430

431431
// ids to link those spans to databricks job/task traces
432432
builder.withTag("databricks_job_id", databricksJobId);
@@ -1177,10 +1177,14 @@ private static String getDatabricksJobRunId(
11771177
}
11781178

11791179
@SuppressForbidden // split with one-char String use a fast-path without regex usage
1180-
private static String getDatabricksTaskRunId(Properties properties) {
1181-
// spark.databricks.job.runId is the runId of the task, not of the Job
1180+
private static String getDatabricksTaskRunId(Properties properties, String jobRunId) {
1181+
// spark.databricks.job.runId is the runId of the task, not of the Job, until Databricks 18.2
11821182
String taskRunId = properties.getProperty("spark.databricks.job.runId");
1183-
if (taskRunId != null) {
1183+
// On Databricks 18.2+, spark.databricks.job.runId now returns the job run ID
1184+
// There is no easy config key to extract the task run ID, so we use the fallback extraction
1185+
// methods
1186+
// Task run ID is crucial for the spans parent-child relationship inside the trace
1187+
if (taskRunId != null && !taskRunId.equals(jobRunId)) {
11841188
return taskRunId;
11851189
}
11861190

dd-java-agent/instrumentation/spark/spark-common/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSparkStructuredStreamingTest.groovy

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class AbstractSparkStructuredStreamingTest extends InstrumentationSpecification
4444
.config("spark.databricks.sparkContextId", "3291395623902517763")
4545
.config("spark.databricks.job.id", "3822225623902514353")
4646
.config("spark.databricks.job.parentRunId", "3851395623902519743")
47-
.config("spark.databricks.job.runId", "3851395623902519743")
47+
.config("spark.databricks.job.runId", "4851395623902519743")
4848
.getOrCreate()
4949
}
5050

@@ -303,7 +303,7 @@ class AbstractSparkStructuredStreamingTest extends InstrumentationSpecification
303303
spanType "spark"
304304
parent()
305305
links({
306-
link(DDTraceId.from((long)12052652441736835200), (long)-6394091631972716416)
306+
link(DDTraceId.from((long)12052652441736835200), (long)1375416004467624525)
307307
})
308308
}
309309
span {

dd-java-agent/instrumentation/spark/spark-common/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSparkTest.groovy

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,50 @@ abstract class AbstractSparkTest extends InstrumentationSpecification {
422422
sparkSession.stop()
423423
}
424424

425+
def "fallback to jobGroup.id when spark.databricks.job.runId equals parentRunId on Databricks 18.2+"() {
426+
setup:
427+
def sparkSession = SparkSession.builder()
428+
.config("spark.master", "local")
429+
.config("spark.default.parallelism", "2")
430+
.config("spark.sql.shuffle.partitions", "2")
431+
.config("spark.databricks.sparkContextId", "some_id")
432+
.getOrCreate()
433+
434+
sparkSession.sparkContext().setLocalProperty("spark.databricks.job.id", "1234")
435+
sparkSession.sparkContext().setLocalProperty("spark.databricks.job.runId", "5678") // Same as parentRunId
436+
sparkSession.sparkContext().setLocalProperty("spark.jobGroup.id", "0000_job-1234-run-7890-action-0000")
437+
sparkSession.sparkContext().setLocalProperty("spark.databricks.job.parentRunId", "5678")
438+
TestSparkComputation.generateTestSparkComputation(sparkSession)
439+
440+
expect:
441+
assertTraces(1) {
442+
trace(3) {
443+
span {
444+
operationName "spark.job"
445+
spanType "spark"
446+
traceId 8944764253919609482G
447+
parentSpanId 3503717452567411167G
448+
assert span.tags["databricks_job_id"] == "1234"
449+
assert span.tags["databricks_job_run_id"] == "5678"
450+
assert span.tags["databricks_task_run_id"] == "7890"
451+
}
452+
span {
453+
operationName "spark.stage"
454+
spanType "spark"
455+
childOf(span(0))
456+
}
457+
span {
458+
operationName "spark.stage"
459+
spanType "spark"
460+
childOf(span(0))
461+
}
462+
}
463+
}
464+
465+
cleanup:
466+
sparkSession.stop()
467+
}
468+
425469
def "compute the databricks parent context"() {
426470
setup:
427471
def contextWithJobRunId = new DatabricksParentContext("1234", "5678", "9012")

0 commit comments

Comments
 (0)