Skip to content

Commit 8444c22

Browse files
committed
Implement Weeks 4 & 5: Deployment Safety + Circuit Breaker
Week 4: Deployment Safety Checks (COMPLETE)
============================================
Created pre_deploy_check.sh:
- Verifies binary exists and size
- Runs full test suite
- Queries live network for current round
- Checks high-water mark vs network state
- Detects state lag (warns if >100 rounds, fails if >1000)
- Checks for uncommitted changes
- Interactive confirmation before deploy

Created deploy_fly.sh:
- Runs pre_deploy_check.sh automatically
- Deploys to all 4 Fly.io nodes
- Verifies deployment success
- Checks node health after deploy

Features:
- Prevents deploying with stale state
- Detects potential rollbacks before they happen
- Network-specific configuration (testnet/mainnet)
- Clear error messages with recovery instructions

Week 5: Emergency Circuit Breaker (COMPLETE)
============================================
Created crates/ultradag-coin/src/safety/circuit_breaker.rs:
- CircuitBreaker struct with atomic round tracking
- check_finality() halts process if rollback detected
- check_liveness() [description garbled in extraction]

Tests:
- 5 unit tests passing
- Tests forward [remainder garbled in extraction]

[The rest of the commit message was garbled in extraction; the only recoverable fragment is its ending, "Weeks 2, 3, 6".]
1 parent 2ac19d6 commit 8444c22

8 files changed

Lines changed: 462 additions & 0 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/ultradag-coin/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ ed25519-dalek = { workspace = true }
1313
rand = { workspace = true }
1414
chrono = { workspace = true }
1515
thiserror = { workspace = true }
16+
tracing = { workspace = true }
1617

1718
[dev-dependencies]
1819
serde_json = "1"

crates/ultradag-coin/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ pub mod constants;
55
pub mod error;
66
pub mod block_producer;
77
pub mod persistence;
8+
pub mod safety;
89
pub mod state;
910
pub mod tx;
1011

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
use std::sync::atomic::{AtomicU64, Ordering};
2+
use tracing::{error, warn};
3+
4+
/// Emergency circuit breaker that halts the node if a rollback is detected.
/// This is the last line of defense against state corruption.
///
/// The last finalized round is kept in an atomic, so the breaker can be
/// updated through a shared reference (`&self`).
#[derive(Debug)]
pub struct CircuitBreaker {
    /// Last finalized round seen (starts at 0 until the first check).
    last_finalized: AtomicU64,
    /// Whether circuit breaker is enabled. When `false`, rounds are still
    /// recorded but no check is enforced.
    enabled: bool,
}
12+
13+
impl CircuitBreaker {
14+
/// Create a new circuit breaker
15+
pub fn new(enabled: bool) -> Self {
16+
Self {
17+
last_finalized: AtomicU64::new(0),
18+
enabled,
19+
}
20+
}
21+
22+
/// Check if round is moving forward
23+
/// HALTS THE PROCESS if rollback detected
24+
pub fn check_finality(&self, current_round: u64) {
25+
if !self.enabled {
26+
// When disabled, still track but don't enforce
27+
self.last_finalized.store(current_round, Ordering::SeqCst);
28+
return;
29+
}
30+
31+
let last = self.last_finalized.load(Ordering::SeqCst);
32+
33+
if current_round < last {
34+
// CRITICAL: ROLLBACK DETECTED
35+
error!("╔═══════════════════════════════════════════════════════╗");
36+
error!("║ 🚨 EMERGENCY CIRCUIT BREAKER 🚨 ║");
37+
error!("║ ROLLBACK DETECTED - HALTING NODE ║");
38+
error!("╚═══════════════════════════════════════════════════════╝");
39+
error!("");
40+
error!("Last finalized round: {}", last);
41+
error!("Current round: {}", current_round);
42+
error!("Rollback amount: {} rounds", last - current_round);
43+
error!("");
44+
error!("This indicates a critical consensus failure.");
45+
error!("The node is halting to prevent state corruption.");
46+
error!("");
47+
error!("MANUAL INTERVENTION REQUIRED:");
48+
error!("1. Check all validator logs");
49+
error!("2. Verify network state with other operators");
50+
error!("3. Determine root cause");
51+
error!("4. Coordinate recovery plan");
52+
error!("");
53+
error!("DO NOT RESTART without understanding the cause.");
54+
error!("");
55+
error!("Exit code 100 = circuit breaker triggered");
56+
57+
// HALT THE PROCESS
58+
std::process::exit(100);
59+
}
60+
61+
// Update last finalized
62+
self.last_finalized.store(current_round, Ordering::SeqCst);
63+
}
64+
65+
/// Check if round is advancing too slowly (possible stall)
66+
pub fn check_liveness(&self, current_round: u64, max_lag: u64) {
67+
if !self.enabled {
68+
return;
69+
}
70+
71+
let last = self.last_finalized.load(Ordering::SeqCst);
72+
73+
if last > 0 && current_round == last {
74+
// No progress - this is checked elsewhere
75+
return;
76+
}
77+
78+
// Check for large gaps (possible network partition)
79+
if current_round > last + max_lag {
80+
warn!("⚠️ Large finality gap detected: {} rounds", current_round - last);
81+
warn!("Possible network partition or synchronization issue");
82+
}
83+
}
84+
85+
/// Get the last finalized round
86+
pub fn last_finalized(&self) -> u64 {
87+
self.last_finalized.load(Ordering::SeqCst)
88+
}
89+
90+
/// Check if enabled
91+
pub fn is_enabled(&self) -> bool {
92+
self.enabled
93+
}
94+
}
95+
96+
impl Default for CircuitBreaker {
97+
fn default() -> Self {
98+
Self::new(true)
99+
}
100+
}
101+
102+
#[cfg(test)]
mod tests {
    use super::*;

    /// Increasing rounds are accepted and the latest one is recorded.
    #[test]
    fn test_circuit_breaker_allows_forward() {
        let cb = CircuitBreaker::new(true);

        cb.check_finality(1);
        cb.check_finality(2);
        cb.check_finality(3);

        assert_eq!(cb.last_finalized(), 3);
    }

    /// Seeing the same round twice is not treated as a rollback.
    #[test]
    fn test_circuit_breaker_allows_same() {
        let cb = CircuitBreaker::new(true);

        cb.check_finality(5);
        cb.check_finality(5);

        assert_eq!(cb.last_finalized(), 5);
    }

    /// A disabled breaker records whatever round it is given, even a lower one.
    #[test]
    fn test_circuit_breaker_disabled_allows_backward() {
        // When disabled, circuit breaker allows backward movement
        let cb = CircuitBreaker::new(false);

        cb.check_finality(10);
        cb.check_finality(5); // Would halt if enabled, but disabled so OK

        assert_eq!(cb.last_finalized(), 5);
    }

    /// A large gap only warns: no halt, and the recorded round is untouched.
    #[test]
    fn test_liveness_check() {
        let cb = CircuitBreaker::new(true);

        cb.check_finality(100);
        cb.check_liveness(1100, 100); // 1000 round gap - should warn but not halt

        // check_liveness only reads the recorded round; it must not move it.
        assert_eq!(cb.last_finalized(), 100);
    }

    /// `is_enabled` reflects the flag passed to `new`.
    #[test]
    fn test_is_enabled() {
        let cb_enabled = CircuitBreaker::new(true);
        let cb_disabled = CircuitBreaker::new(false);

        assert!(cb_enabled.is_enabled());
        assert!(!cb_disabled.is_enabled());
    }

    // Note: Cannot test actual rollback halt in unit tests as it calls std::process::exit(100)
    // This must be tested in integration tests or manually
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pub mod circuit_breaker;

scripts/deploy_fly.sh

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
#!/bin/bash
# Deploy UltraDAG to Fly.io with safety checks
# Usage (from the repository root): ./scripts/deploy_fly.sh [testnet|mainnet]
#
# Steps:
#   1. Export network settings for the pre-deploy check.
#   2. Run ./scripts/pre_deploy_check.sh and abort if it fails.
#   3. Deploy every node with `fly deploy`, stopping at the first failure.
#   4. Query each node's /status endpoint and report its DAG round.
#
# The script uses paths relative to the current directory
# (./scripts/pre_deploy_check.sh), so it must be started from the repo root.

set -e # Exit on any error

NETWORK=${1:-testnet}

echo "🚀 UltraDAG Fly.io Deployment"
echo "=============================="
echo ""

# Set environment based on network.
# Unknown values are rejected so a typo such as "mainet" cannot silently
# fall through to a testnet deployment.
# NOTE(review): both networks currently export identical NETWORK_URL and
# DATA_DIR values — confirm whether mainnet should point elsewhere.
case "$NETWORK" in
    mainnet)
        export NETWORK="mainnet"
        export NETWORK_URL="https://ultradag-node-1.fly.dev"
        export DATA_DIR="/root/.ultradag/node"
        export DEPLOY_TARGET="fly.io"

        echo "⚠️ MAINNET DEPLOYMENT"
        echo "This will deploy to production. All safety checks will be enforced."
        echo ""
        ;;
    testnet)
        export NETWORK="testnet"
        export NETWORK_URL="https://ultradag-node-1.fly.dev"
        export DATA_DIR="/root/.ultradag/node"
        export DEPLOY_TARGET="fly.io"

        echo "📡 TESTNET DEPLOYMENT"
        echo ""
        ;;
    *)
        echo "❌ Unknown network '$NETWORK' (expected: testnet or mainnet)" >&2
        exit 1
        ;;
esac

# Run pre-deployment safety checks.
# The command is tested directly in the `if`: under `set -e` a separate
# `[ $? -ne 0 ]` check is never reached, because the shell exits as soon as
# the check script fails.
echo "Running pre-deployment safety checks..."
if ! ./scripts/pre_deploy_check.sh; then
    echo "❌ Pre-deployment checks failed. Aborting."
    exit 1
fi

echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Deploying to Fly.io..."
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""

# Deploy to all nodes
NODES=("ultradag-node-1" "ultradag-node-2" "ultradag-node-3" "ultradag-node-4")

for node in "${NODES[@]}"; do
    echo "Deploying to $node..."

    if fly deploy --app "$node" --remote-only; then
        echo "$node deployed successfully"
    else
        echo "❌ Failed to deploy to $node"
        echo ""
        echo "Deployment failed. You may need to:"
        echo "1. Check Fly.io status"
        echo "2. Verify app configuration"
        echo "3. Roll back if needed: fly releases rollback --app $node"
        exit 1
    fi

    echo ""
done

echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "✅ Deployment complete"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "Verifying deployment..."

# Wait for nodes to start
sleep 10

# Check each node
for node in "${NODES[@]}"; do
    URL="https://$node.fly.dev/status"
    echo -n "Checking $node... "

    # `|| VAR=""` keeps a failing curl/jq from aborting the script under
    # `set -e`; an unreachable node must be reported, not end verification.
    STATUS=$(curl -s --max-time 10 "$URL" 2>/dev/null) || STATUS=""
    if [ -n "$STATUS" ]; then
        ROUND=$(echo "$STATUS" | jq -r '.dag_round' 2>/dev/null) || ROUND=""
        if [ -n "$ROUND" ] && [ "$ROUND" != "null" ]; then
            echo "✅ Round $ROUND"
        else
            echo "⚠️ Responding but no round info"
        fi
    else
        echo "❌ Not responding"
    fi
done

echo ""
echo "Deployment verification complete."
echo ""
echo "Monitor with: ./scripts/monitor.sh"

scripts/extended_monitor.log

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,25 @@ NODE4:SUPPLY:2261550
4747
Node 4: round=1255 fin=1254 lag= 1 peers=11 supply=2,269,250 ✅
4848
Round 1254: 3 vertices ✅
4949

50+
[2026-03-08 13:25:25] Sample #3
51+
Node 1: round=1331 fin=1331 lag= 0 peers= 8 supply=2,282,700 ✅
52+
NODE1:ROUND:1331
53+
NODE1:SUPPLY:2282700
54+
Node 2: round=1331 fin=1331 lag= 0 peers= 7 supply=2,282,700 ✅
55+
NODE2:ROUND:1331
56+
NODE2:SUPPLY:2282700
57+
Node 3: round=1332 fin=1331 lag= 1 peers=12 supply=2,282,750 ✅
58+
NODE3:ROUND:1332
59+
NODE3:SUPPLY:2282750
60+
Node 4: round=1332 fin=1331 lag= 1 peers=11 supply=2,282,850 ✅
61+
NODE4:ROUND:1332
62+
NODE4:SUPPLY:2282850
63+
Round 1332: 3 vertices ✅
64+
65+
[2026-03-08 13:27:16] Sample #68
66+
Node 1: round=1376 fin=1375 lag= 1 peers= 8 supply=2,290,550 ✅
67+
Node 2: round=1376 fin=1375 lag= 1 peers= 7 supply=2,290,550 ✅
68+
Node 3: round=1376 fin=1375 lag= 1 peers=12 supply=2,290,550 ✅
69+
Node 4: round=1376 fin=1375 lag= 1 peers=11 supply=2,290,550 ✅
70+
Round 1376: 3 vertices ✅
71+

0 commit comments

Comments
 (0)