Skip to content

Commit ecadfb0

Browse files
author
Zach Birenbaum
committed
Remove old actions with no listeners
Implement scheduler-side removal of actions with no listeners. Adds a `disconnect_timeout_s` configuration field with a default of 60s. If the client waiting on a given action is disconnected for longer than this duration without reconnecting, the scheduler will stop tracking it. This does not remove it from the worker if the job has already been dispatched. Fixes #338.
1 parent aa599c3 commit ecadfb0

3 files changed

Lines changed: 131 additions & 1 deletion

File tree

nativelink-config/src/schedulers.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,12 @@ pub struct SimpleScheduler {
119119
/// The strategy used to assign workers jobs.
120120
#[serde(default)]
121121
pub allocation_strategy: WorkerAllocationStrategy,
122+
123+
/// Remove an action from the queue once it has gone this long without any
124+
/// listener (i.e. no client waiting on it). Amount of time in seconds.
125+
/// Default: 60 (seconds)
126+
#[serde(default, deserialize_with = "convert_numeric_with_shellexpand")]
127+
pub disconnect_timeout_s: u64,
122128
}
123129

124130
/// A scheduler that simply forwards requests to an upstream scheduler. This

nativelink-scheduler/src/simple_scheduler.rs

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use std::collections::BTreeMap;
1818
use std::hash::{Hash, Hasher};
1919
use std::sync::atomic::{AtomicU64, Ordering};
2020
use std::sync::Arc;
21-
use std::time::{Instant, SystemTime};
21+
use std::time::{Instant, SystemTime, UNIX_EPOCH};
2222

2323
use async_trait::async_trait;
2424
use futures::Future;
@@ -57,6 +57,10 @@ const DEFAULT_RETAIN_COMPLETED_FOR_S: u64 = 60;
5757
/// If this changes, remember to change the documentation in the config.
5858
const DEFAULT_MAX_JOB_RETRIES: usize = 3;
5959

60+
/// Default timeout, in seconds, for actions without any listeners.
61+
/// If this changes, remember to change the documentation in the config.
62+
const DEFAULT_DISCONNECT_TIMEOUT_S: u64 = 60;
63+
6064
/// An action that is being awaited on and last known state.
6165
struct AwaitedAction {
6266
action_info: Arc<ActionInfo>,
@@ -68,14 +72,36 @@ struct AwaitedAction {
6872
/// Possible last error set by the worker. If empty and attempts is set, it may be due to
6973
/// something like a worker timeout.
7074
last_error: Option<Error>,
75+
76+
/// Updated on every client connect and periodically while it has listeners.
77+
last_update_timestamp: Mutex<u64>
7178
}
7279

80+
impl AwaitedAction {
81+
pub fn set_last_update_timestamp(&self, timestamp: u64) {
82+
let mut guard = self.last_update_timestamp.lock();
83+
*guard = timestamp;
84+
}
85+
pub fn get_last_update_timestamp(&self) -> u64 {
86+
let guard = self.last_update_timestamp.lock();
87+
*guard
88+
}
89+
}
7390
/// Holds the relationship of a worker that is executing a specific action.
7491
struct RunningAction {
7592
/// Identifier of the worker that is executing `action`.
worker_id: WorkerId,
7693
/// The awaited action being executed; the timestamp accessors on
/// `RunningAction` forward to this value.
action: AwaitedAction,
7794
}
7895

96+
impl RunningAction {
97+
pub fn set_last_update_timestamp(&self, timestamp: u64) {
98+
self.action.set_last_update_timestamp(timestamp);
99+
}
100+
pub fn get_last_update_timestamp(&self) -> u64 {
101+
self.action.get_last_update_timestamp()
102+
}
103+
}
104+
79105
struct Workers {
80106
workers: LruCache<WorkerId, Worker>,
81107
/// The allocation strategy for workers.
@@ -230,6 +256,8 @@ struct SimpleSchedulerImpl {
230256
/// Notify task<->worker matching engine that work needs to be done.
231257
tasks_or_workers_change_notify: Arc<Notify>,
232258
metrics: Arc<Metrics>,
259+
/// How long the server will wait for a client to reconnect before removing the action from the queue.
260+
disconnect_timeout_s: u64,
233261
}
234262

235263
impl SimpleSchedulerImpl {
@@ -307,6 +335,7 @@ impl SimpleSchedulerImpl {
307335
notify_channel: tx,
308336
attempts: 0,
309337
last_error: None,
338+
last_update_timestamp: Mutex::new(0)
310339
},
311340
);
312341

@@ -428,6 +457,7 @@ impl SimpleSchedulerImpl {
428457
Ok(())
429458
}
430459

460+
431461
// TODO(blaise.bruer) This is an O(n*m) (aka n^2) algorithm. In theory we can create a map
432462
// of capabilities of each worker and then try and match the actions to the worker using
433463
// the map lookup (ie. map reduce).
@@ -440,6 +470,15 @@ impl SimpleSchedulerImpl {
440470
let action_infos: Vec<Arc<ActionInfo>> =
441471
self.queued_actions.keys().rev().cloned().collect();
442472
for action_info in action_infos {
473+
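// Refresh the last-update timestamp of queued actions that still have listeners, and drop those whose listeners have been gone for longer than `disconnect_timeout_s`.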
474+
let action = self.queued_actions.get_mut(&action_info).unwrap();
475+
let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs();
476+
if action.notify_channel.receiver_count() > 0 {
477+
action.set_last_update_timestamp(now)
478+
} else if action.get_last_update_timestamp() + self.disconnect_timeout_s < now {
479+
self.queued_actions_set.remove(&action_info);
480+
self.queued_actions.remove(&action_info);
481+
}
443482
let Some(awaited_action) = self.queued_actions.get(action_info.as_ref()) else {
444483
error!(
445484
"queued_actions out of sync with itself for action {}",
@@ -501,6 +540,17 @@ impl SimpleSchedulerImpl {
501540
},
502541
);
503542
}
543+
544+
let mut remove_actions = Vec::new();
545+
let running_actions = &mut self.active_actions.values().collect::<Vec<_>>();
546+
for running_action in running_actions {
547+
if running_action.action.notify_channel.receiver_count() > 0 {
548+
running_action.set_last_update_timestamp(SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs());
549+
} else if running_action.get_last_update_timestamp() + self.disconnect_timeout_s < SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs() {
550+
remove_actions.push(running_action.action.action_info.clone())
551+
}
552+
}
553+
self.active_actions.retain(|x, _| { !remove_actions.contains(x) });
504554
}
505555

506556
fn update_action_with_internal_error(
@@ -694,6 +744,11 @@ impl SimpleScheduler {
694744
max_job_retries = DEFAULT_MAX_JOB_RETRIES;
695745
}
696746

747+
let mut disconnect_timeout_s = scheduler_cfg.disconnect_timeout_s;
748+
if disconnect_timeout_s == 0 {
749+
disconnect_timeout_s = DEFAULT_DISCONNECT_TIMEOUT_S;
750+
}
751+
697752
let tasks_or_workers_change_notify = Arc::new(Notify::new());
698753

699754
let metrics = Arc::new(Metrics::default());
@@ -709,6 +764,7 @@ impl SimpleScheduler {
709764
max_job_retries,
710765
tasks_or_workers_change_notify: tasks_or_workers_change_notify.clone(),
711766
metrics: metrics.clone(),
767+
disconnect_timeout_s
712768
}));
713769
let weak_inner = Arc::downgrade(&inner);
714770
Self {

nativelink-scheduler/tests/simple_scheduler_test.rs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ async fn setup_action(
9696
#[cfg(test)]
9797
mod scheduler_tests {
9898
use pretty_assertions::assert_eq;
99+
use tokio::time::sleep;
99100

100101
use super::*; // Must be declared in every module.
101102

@@ -1602,4 +1603,71 @@ mod scheduler_tests {
16021603

16031604
Ok(())
16041605
}
1606+
1607+
#[tokio::test]
1608+
async fn ensure_actions_with_disconnected_clients_are_dropped() -> Result<(), Error> {
1609+
const WORKER_ID: WorkerId = WorkerId(0x1234_5678_9111);
1610+
const DISCONNECT_TIMEOUT_S: u64 = 1;
1611+
1612+
let scheduler = SimpleScheduler::new_with_callback(
1613+
&nativelink_config::schedulers::SimpleScheduler {
1614+
disconnect_timeout_s: DISCONNECT_TIMEOUT_S,
1615+
..Default::default()
1616+
},
1617+
|| async move {},
1618+
);
1619+
let action1_digest = DigestInfo::new([98u8; 32], 512);
1620+
let action2_digest = DigestInfo::new([99u8; 32], 512);
1621+
1622+
let mut rx_from_worker =
1623+
setup_new_worker(&scheduler, WORKER_ID, PlatformProperties::default()).await?;
1624+
let insert_timestamp = make_system_time(1);
1625+
1626+
let client_rx = setup_action(
1627+
&scheduler,
1628+
action1_digest,
1629+
PlatformProperties::default(),
1630+
insert_timestamp,
1631+
)
1632+
.await?;
1633+
1634+
// Drop our receiver
1635+
let unique_qualifier = client_rx.borrow().unique_qualifier.clone();
1636+
drop(client_rx);
1637+
1638+
// Allow task<->worker matcher to run.
1639+
tokio::task::yield_now().await;
1640+
1641+
// Sleep for longer than disconnect_timeout_s
1642+
let _ = sleep(Duration::from_secs(DISCONNECT_TIMEOUT_S + 1)).await;
1643+
1644+
{
1645+
// Other tests check full data. We only care if we got StartAction.
1646+
match rx_from_worker.recv().await.unwrap().update {
1647+
Some(update_for_worker::Update::StartAction(_)) => { /* Success */ }
1648+
v => panic!("Expected StartAction, got : {v:?}"),
1649+
}
1650+
}
1651+
1652+
// Setup a second action so matching engine is scheduled to rerun
1653+
let client_rx = setup_action(
1654+
&scheduler,
1655+
action2_digest,
1656+
PlatformProperties::default(),
1657+
insert_timestamp,
1658+
)
1659+
.await?;
1660+
drop(client_rx);
1661+
1662+
// Allow task<->worker matcher to run.
1663+
tokio::task::yield_now().await;
1664+
1665+
// Check to make sure that the action was removed
1666+
assert!(scheduler
1667+
.find_existing_action(&unique_qualifier)
1668+
.await
1669+
.is_none(),);
1670+
1671+
Ok(())
1672+
}
16051673
}

0 commit comments

Comments
 (0)