Skip to content

Commit b6b7a0b

Browse files
authored
Release stale deployment locks and add TTL (#107)
* Release stale deployment locks and add TTL

  Reclaim stale deployment locks and prevent permanent lockout: aa-core now releases a stale lock when the account isn't deployed and returns `NotDeployed` so callers can retry. The external bundler acquires locks with Redis `SET` using NX + EX (via `SetOptions`/`SetExpiry`) and adds a 300-second fallback, `LOCK_TTL_SECONDS`, so that locks auto-expire if a holder dies. This avoids stuck deployments and makes lock acquisition atomic with an expiry.

* Add release_lock_if_owner and use atomic reclaim

  Add a `DeploymentLock::release_lock_if_owner` method and use it when reclaiming stale deployment locks, to avoid deleting a lock that another worker may have acquired. Implement `RedisDeploymentLock::release_lock_if_owner` with an atomic compare-and-delete Lua script (GET + cjson.decode + DEL if `lock_id` matches) and map Redis errors to `EngineError`. Update the aa-core deployment flow to call the new method instead of unconditionally deleting the lock.
1 parent b6f39f4 commit b6b7a0b

2 files changed

Lines changed: 62 additions & 4 deletions

File tree

aa-core/src/userop/deployment.rs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,15 @@ pub trait DeploymentLock: Send + Sync {
4242
chain_id: u64,
4343
account_address: &Address,
4444
) -> impl Future<Output = Result<bool, EngineError>> + Send;
45+
46+
/// Release the deployment lock for `account_address` on `chain_id`, but only
/// if the stored lock still holds the given `lock_id`.
47+
/// Atomic compare-and-delete: resolves to `Ok(true)` if a matching lock was
/// removed. Intended for reclaiming a stale lock that the caller observed,
/// without deleting a lock that another worker has acquired in the meantime.
48+
fn release_lock_if_owner(
49+
&self,
50+
chain_id: u64,
51+
account_address: &Address,
52+
lock_id: &str,
53+
) -> impl Future<Output = Result<bool, EngineError>> + Send;
4554
}
4655

4756
pub enum DeploymentStatus {
@@ -104,9 +113,16 @@ where
104113
if is_deployed {
105114
return Ok(DeploymentStatus::Deployed);
106115
}
116+
117+
// Stale lock, not deployed: previous holder died without releasing.
118+
// Reclaim only the lock we observed, so we don't delete a lock that
119+
// another worker acquired while we were checking chain state.
120+
self.lock
121+
.release_lock_if_owner(chain_id, account_address, &lock_id)
122+
.await?;
123+
return Ok(DeploymentStatus::NotDeployed);
107124
}
108125

109-
// Either fresh lock or stale but not deployed
110126
return Ok(DeploymentStatus::BeingDeployed {
111127
stale: is_stale,
112128
lock_id,

executors/src/external_bundler/deployment.rs

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,17 @@ use serde::{Deserialize, Serialize};
77
use std::time::{Duration, SystemTime, UNIX_EPOCH};
88
use twmq::{
99
error::TwmqError,
10-
redis::{AsyncCommands, Pipeline, aio::ConnectionManager},
10+
redis::{AsyncCommands, Pipeline, SetExpiry, SetOptions, aio::ConnectionManager},
1111
};
1212
use uuid::Uuid;
1313

1414
const CACHE_PREFIX: &str = "deployment_cache";
1515
const LOCK_PREFIX: &str = "deployment_lock";
1616

17+
/// Fallback TTL so a lock that's never explicitly released (e.g. worker crash)
18+
/// can't block the account forever.
19+
const LOCK_TTL_SECONDS: u64 = 300;
20+
1721
#[derive(Clone)]
1822
pub struct RedisDeploymentCache {
1923
connection_manager: twmq::redis::aio::ConnectionManager,
@@ -165,9 +169,13 @@ impl DeploymentLock for RedisDeploymentLock {
165169
message: format!("Serialization failed: {e}"),
166170
})?;
167171

168-
// Use SET NX EX for atomic acquire
172+
// SET NX EX: atomic acquire with a fallback expiry.
173+
let opts = SetOptions::default()
174+
.conditional_set(twmq::redis::ExistenceCheck::NX)
175+
.with_expiration(SetExpiry::EX(LOCK_TTL_SECONDS));
176+
169177
let result: Option<String> =
170-
conn.set_nx(&key, &lock_data_str)
178+
conn.set_options(&key, &lock_data_str, opts)
171179
.await
172180
.map_err(|e| EngineError::InternalError {
173181
message: format!("Lock acquire failed: {e}"),
@@ -212,4 +220,38 @@ impl DeploymentLock for RedisDeploymentLock {
212220

213221
Ok(deleted > 0)
214222
}
223+
224+
async fn release_lock_if_owner(
225+
&self,
226+
chain_id: u64,
227+
account_address: &Address,
228+
lock_id: &str,
229+
) -> Result<bool, EngineError> {
230+
let mut conn = self.conn().clone();
231+
let key = self.lock_key(chain_id, account_address);
232+
233+
// Atomic compare-and-delete: only DEL if the stored lock's lock_id matches.
234+
let script = twmq::redis::Script::new(
235+
r#"
236+
local v = redis.call('GET', KEYS[1])
237+
if not v then return 0 end
238+
local ok, data = pcall(cjson.decode, v)
239+
if ok and data.lock_id == ARGV[1] then
240+
return redis.call('DEL', KEYS[1])
241+
end
242+
return 0
243+
"#,
244+
);
245+
246+
let deleted: i64 = script
247+
.key(&key)
248+
.arg(lock_id)
249+
.invoke_async(&mut conn)
250+
.await
251+
.map_err(|e| EngineError::InternalError {
252+
message: format!("Failed to release lock for account {account_address}: {e}"),
253+
})?;
254+
255+
Ok(deleted > 0)
256+
}
215257
}

0 commit comments

Comments
 (0)