@@ -67,7 +67,7 @@ Fields:
6767- `worker_chans::Dict{Int, Tuple{RemoteChannel,RemoteChannel}}` - Communication channels between the scheduler and each worker
6868- `signature_time_cost::Dict{Signature,UInt64}` - Cache of estimated CPU time (in nanoseconds) required to compute calls with the given signature
6969- `signature_alloc_cost::Dict{Signature,UInt64}` - Cache of estimated CPU RAM (in bytes) required to compute calls with the given signature
70- - `transfer_rate::Ref{ UInt64}` - Estimate of the network transfer rate in bytes per second
70+ - `worker_transfer_rate::Dict{Int,Dict{Processor, UInt64}} ` - Maps from worker ID to per-processor network transfer rate estimates in bytes per second
7171- `halt::Base.Event` - Event indicating that the scheduler is halting
7272- `lock::ReentrantLock` - Lock around operations which modify the state
7373- `futures::Dict{Thunk, Vector{ThunkFuture}}` - Futures registered for waiting on the result of a thunk.
@@ -93,7 +93,7 @@ struct ComputeState
9393 worker_chans:: Dict{Int, Tuple{RemoteChannel,RemoteChannel}}
9494 signature_time_cost:: Dict{Signature,UInt64}
9595 signature_alloc_cost:: Dict{Signature,UInt64}
96- transfer_rate :: Ref{ UInt64}
96+ worker_transfer_rate :: Dict{Int,Dict{Processor, UInt64} }
9797 halt:: Base.Event
9898 lock:: ReentrantLock
9999 futures:: Dict{Thunk, Vector{ThunkFuture}}
@@ -122,7 +122,7 @@ function start_state(deps::Dict, node_order, chan)
122122 Dict {Int, Tuple{RemoteChannel,RemoteChannel}} (),
123123 Dict {Signature,UInt64} (),
124124 Dict {Signature,UInt64} (),
125- Ref { UInt64}( 1_000_000 ),
125+ Dict {Int,Dict{Processor, UInt64}} ( ),
126126 Base. Event (),
127127 ReentrantLock (),
128128 Dict {Thunk, Vector{ThunkFuture}} (),
@@ -157,6 +157,7 @@ function init_proc(state, p, log_sink)
157157 gproc = OSProc (p. pid)
158158 lock (state. lock) do
159159 state. worker_time_pressure[p. pid] = Dict {Processor,UInt64} ()
160+ state. worker_transfer_rate[p. pid] = Dict {Processor,UInt64} ()
160161
161162 state. worker_storage_pressure[p. pid] = Dict {Union{StorageResource,Nothing},UInt64} ()
162163 state. worker_storage_capacity[p. pid] = Dict {Union{StorageResource,Nothing},UInt64} ()
@@ -430,7 +431,12 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options::SchedulerOpt
430431 state. signature_time_cost[sig] = (metadata. threadtime + get (state. signature_time_cost, sig, 0 )) ÷ 2
431432 state. signature_alloc_cost[sig] = (metadata. gc_allocd + get (state. signature_alloc_cost, sig, 0 )) ÷ 2
432433 if metadata. transfer_rate != = nothing
433- state. transfer_rate[] = (state. transfer_rate[] + metadata. transfer_rate) ÷ 2
434+ old_rate = get (state. worker_transfer_rate[pid], proc, UInt64 (0 ))
435+ if old_rate == 0
436+ state. worker_transfer_rate[pid][proc] = metadata. transfer_rate
437+ else
438+ state. worker_transfer_rate[pid][proc] = (old_rate + metadata. transfer_rate) ÷ 2
439+ end
434440 end
435441 end
436442 if res isa Chunk
@@ -736,6 +742,7 @@ function remove_dead_proc!(ctx, state, proc, options)
736742 @assert options. single != = proc. pid " Single worker failed, cannot continue."
737743 rmprocs! (ctx, [proc])
738744 delete! (state. worker_time_pressure, proc. pid)
745+ delete! (state. worker_transfer_rate, proc. pid)
739746 delete! (state. worker_storage_pressure, proc. pid)
740747 delete! (state. worker_storage_capacity, proc. pid)
741748 delete! (state. worker_loadavg, proc. pid)
0 commit comments