11use crate :: common:: { require_one_child, serialize_uuid} ;
22use crate :: coordinator:: metrics_store:: MetricsStore ;
33use crate :: coordinator:: prepare_static_plan:: prepare_static_plan;
4+ use crate :: coordinator:: query_coordinator:: QueryCoordinator ;
45use crate :: distributed_planner:: NetworkBoundaryExt ;
56use crate :: worker:: generated:: worker:: TaskKey ;
67use datafusion:: common:: internal_datafusion_err;
7- use datafusion:: common:: runtime:: JoinSet ;
88use datafusion:: common:: tree_node:: { TreeNode , TreeNodeRecursion } ;
99use datafusion:: common:: { Result , exec_err} ;
1010use datafusion:: execution:: { SendableRecordBatchStream , TaskContext } ;
@@ -15,8 +15,7 @@ use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, Pla
1515use futures:: StreamExt ;
1616use std:: any:: Any ;
1717use std:: fmt:: Formatter ;
18- use std:: sync:: Arc ;
19- use std:: sync:: Mutex ;
18+ use std:: sync:: { Arc , Mutex } ;
2019
2120/// [ExecutionPlan] that executes the inner plan in distributed mode.
2221/// Before executing it, two modifications are lazily performed on the plan:
@@ -27,22 +26,39 @@ use std::sync::Mutex;
2726/// over the wire.
2827#[ derive( Debug ) ]
2928pub struct DistributedExec {
30- plan : Arc < dyn ExecutionPlan > ,
31- prepared_plan : Arc < Mutex < Option < Arc < dyn ExecutionPlan > > > > ,
29+ /// Initial [ExecutionPlan] present before execution.
30+ /// - If the plan was distributed statically, this will be the final distributed plan with all
31+ /// the appropriate network boundaries in it.
32+ /// - If the plan is going to be distributed dynamically during execution, this is the initial
33+ /// non-distributed plan.
34+ base_plan : Arc < dyn ExecutionPlan > ,
35+ /// Resulting [ExecutionPlan] after execution ready for visualization purposes.
36+ /// - If the plan was distributed statically, this is equal to the base plan.
37+ /// - If the plan is going to be distributed dynamically during execution, this is the resulting
38+ /// plan re-calculated based on runtime statistics.
39+ plan_for_viz : Arc < Mutex < Option < Arc < dyn ExecutionPlan > > > > ,
40+ /// The head stage meant to be executed locally on [DistributedExec::execute].
41+ head_stage : Arc < Mutex < Option < Arc < dyn ExecutionPlan > > > > ,
42+ /// DataFusion metrics.
3243 metrics : ExecutionPlanMetricsSet ,
44+ /// Storage where metrics collected from workers at runtime will place their results as they
45+ /// finish their respective remote tasks.
3346 pub ( crate ) metrics_store : Option < Arc < MetricsStore > > ,
3447}
3548
3649pub ( super ) struct PreparedPlan {
50+ /// The head stage meant to be executed locally by the coordinator.
3751 pub ( super ) head_stage : Arc < dyn ExecutionPlan > ,
38- pub ( super ) join_set : JoinSet < Result < ( ) > > ,
52+ /// A final representation of the plan for visualization purposes.
53+ pub ( super ) plan_for_viz : Arc < dyn ExecutionPlan > ,
3954}
4055
4156impl DistributedExec {
42- pub fn new ( plan : Arc < dyn ExecutionPlan > ) -> Self {
57+ pub fn new ( base_plan : Arc < dyn ExecutionPlan > ) -> Self {
4358 Self {
44- plan,
45- prepared_plan : Arc :: new ( Mutex :: new ( None ) ) ,
59+ base_plan,
60+ plan_for_viz : Arc :: new ( Mutex :: new ( None ) ) ,
61+ head_stage : Arc :: new ( Mutex :: new ( None ) ) ,
4662 metrics : ExecutionPlanMetricsSet :: new ( ) ,
4763 metrics_store : None ,
4864 }
@@ -69,7 +85,10 @@ impl DistributedExec {
6985 let Some ( task_metrics) = & self . metrics_store else {
7086 return ;
7187 } ;
72- let _ = self . plan . apply ( |plan| {
88+ let Some ( plan) = self . plan_for_viz . lock ( ) . unwrap ( ) . as_ref ( ) . cloned ( ) else {
89+ return ;
90+ } ;
91+ let _ = plan. apply ( |plan| {
7392 if let Some ( boundary) = plan. as_network_boundary ( ) {
7493 let stage = boundary. input_stage ( ) ;
7594 for i in 0 ..stage. task_count ( ) {
@@ -94,15 +113,27 @@ impl DistributedExec {
94113 /// Returns the plan which is lazily prepared on `execute()` and actually gets executed.
95114 /// It is updated on every call to `execute()`. Returns an error if `.execute()` has not been
96115 /// called.
97- pub ( crate ) fn prepared_plan ( & self ) -> Result < Arc < dyn ExecutionPlan > > {
98- self . prepared_plan
116+ pub ( crate ) fn plan_for_viz ( & self ) -> Result < Arc < dyn ExecutionPlan > > {
117+ self . plan_for_viz
99118 . lock ( )
100119 . map_err ( |e| internal_datafusion_err ! ( "Failed to lock prepared plan: {}" , e) ) ?
101120 . clone ( )
102121 . ok_or_else ( || {
103122 internal_datafusion_err ! ( "No prepared plan found. Was execute() called?" )
104123 } )
105124 }
125+
126+ /// Returns the head stage that was actually executed. Unlike [`Self::plan_for_viz`] (which is
127+ /// reconstructed for visualization, with `Stage::Local` boundaries and rebuilt ancestor
128+ /// `Arc`s), this returns the original `Arc` instances whose metrics were populated during
129+ /// execution.
130+ pub ( crate ) fn head_stage ( & self ) -> Result < Arc < dyn ExecutionPlan > > {
131+ self . head_stage
132+ . lock ( )
133+ . map_err ( |e| internal_datafusion_err ! ( "Failed to lock head stage: {}" , e) ) ?
134+ . clone ( )
135+ . ok_or_else ( || internal_datafusion_err ! ( "No head stage found. Was execute() called?" ) )
136+ }
106137}
107138
108139impl DisplayAs for DistributedExec {
@@ -121,20 +152,21 @@ impl ExecutionPlan for DistributedExec {
121152 }
122153
123154 fn properties ( & self ) -> & Arc < PlanProperties > {
124- self . plan . properties ( )
155+ self . base_plan . properties ( )
125156 }
126157
127158 fn children ( & self ) -> Vec < & Arc < dyn ExecutionPlan > > {
128- vec ! [ & self . plan ]
159+ vec ! [ & self . base_plan ]
129160 }
130161
131162 fn with_new_children (
132163 self : Arc < Self > ,
133164 children : Vec < Arc < dyn ExecutionPlan > > ,
134165 ) -> Result < Arc < dyn ExecutionPlan > > {
135166 Ok ( Arc :: new ( DistributedExec {
136- plan : require_one_child ( & children) ?,
137- prepared_plan : self . prepared_plan . clone ( ) ,
167+ base_plan : require_one_child ( & children) ?,
168+ plan_for_viz : Arc :: new ( Mutex :: new ( None ) ) ,
169+ head_stage : Arc :: new ( Mutex :: new ( None ) ) ,
138170 metrics : self . metrics . clone ( ) ,
139171 metrics_store : self . metrics_store . clone ( ) ,
140172 } ) )
@@ -155,36 +187,43 @@ impl ExecutionPlan for DistributedExec {
155187 ) ;
156188 }
157189
158- let PreparedPlan {
159- head_stage,
160- join_set,
161- } = prepare_static_plan ( & self . plan , & self . metrics , & self . metrics_store , & context) ?;
162- {
163- let mut guard = self
164- . prepared_plan
165- . lock ( )
166- . map_err ( |e| internal_datafusion_err ! ( "Failed to lock prepared plan: {e}" ) ) ?;
167- * guard = Some ( head_stage. clone ( ) ) ;
168- }
190+ let base_plan = Arc :: clone ( & self . base_plan ) ;
191+ let plan_for_viz = Arc :: clone ( & self . plan_for_viz ) ;
192+ let head_stage = Arc :: clone ( & self . head_stage ) ;
193+
194+ let query_coordinator = QueryCoordinator :: new (
195+ Arc :: clone ( & context) ,
196+ & self . metrics ,
197+ self . metrics_store . clone ( ) ,
198+ ) ;
199+
169200 let mut builder = RecordBatchReceiverStreamBuilder :: new ( self . schema ( ) , 1 ) ;
170201 let tx = builder. tx ( ) ;
171- // Spawn the task that pulls data from child...
202+
172203 builder. spawn ( async move {
173- let mut stream = head_stage. execute ( partition, context) ?;
204+ let _guard = query_coordinator. end_query_guard ( ) ;
205+
206+ let result = prepare_static_plan ( & query_coordinator, & base_plan) ?;
207+
208+ plan_for_viz
209+ . lock ( )
210+ . expect ( "poisoned lock" )
211+ . replace ( result. plan_for_viz ) ;
212+ head_stage
213+ . lock ( )
214+ . expect ( "poisoned lock" )
215+ . replace ( Arc :: clone ( & result. head_stage ) ) ;
216+ let mut stream = result. head_stage . execute ( partition, context) ?;
174217 while let Some ( msg) = stream. next ( ) . await {
175218 if tx. send ( msg) . await . is_err ( ) {
176219 break ; // channel closed
177220 }
178221 }
222+ drop ( tx) ;
223+ query_coordinator. drain_pending_tasks ( ) . await ?;
179224 Ok ( ( ) )
180225 } ) ;
181- // ...in parallel to the one that feeds the plan to workers.
182- builder. spawn ( async move {
183- for res in join_set. join_all ( ) . await {
184- res?;
185- }
186- Ok ( ( ) )
187- } ) ;
226+
188227 Ok ( builder. build ( ) )
189228 }
190229
0 commit comments