Skip to content

Commit c112143

Browse files
committed
feat(relay): expose script health telemetry
Add a read-only per-deployment health snapshot over the existing relay state so operators can inspect how deployment selection is behaving without changing the scheduler itself. The snapshot reports masked script IDs, locally observed rolling quota usage, the configured local quota threshold, saturation state, active cooldown seconds, cooldown reason, and timeout strike count. Cooldown reasons are tracked alongside the existing blacklist timestamps and are pruned whenever expired blacklist entries are removed. Surface the snapshot in the desktop UI as a collapsible Script health table, clear stale rows when the proxy stops or exits, and document that these values are local client observations rather than authoritative Google-side quota counters. Add focused unit coverage for quota saturation, cooldown reason exposure, timeout strike visibility, and compact duration formatting. The relay routing, quarantine durations, and selection behavior remain unchanged.
1 parent dd603a1 commit c112143

4 files changed

Lines changed: 223 additions & 1 deletion

File tree

docs/guide.fa.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,8 @@ HTTP / HTTPS مثل قبل از Apps Script می‌رود (تغییری نمی
227227

228228
بیشتر Deployment = همزمانی بیشتر = تأخیر کمتر هر سشن. انتخاب هر بَچ از بین IDهای تنظیم‌شده با یک ledger محلی rolling 24-hour انجام می‌شود؛ بار پخش می‌شود و کلاینت از Deploymentهایی که همین دستگاه نزدیک سقف request سهمیهٔ رایگان برده دوری می‌کند.
229229

230+
پنل **Script health** در UI دسکتاپ همین وضعیت محلی را فقط به‌صورت read-only نشان می‌دهد: Deployment ID ماسک‌شده، تعداد callهای مشاهده‌شده در پنجرهٔ rolling 24-hour، اینکه threshold محلی free-tier اشباع شده یا نه، cooldown باقی‌مانده، دلیل/کلاس خطایی که آن cooldown را ساخته، و تعداد timeout strikeهای فعلی. این فقط telemetry سمت کلاینت است؛ اگر دستگاه‌های دیگر هم از همان deployment استفاده کنند، Google ممکن است callهای بیشتری شمرده باشد.
231+
230232
**محافظ‌های منابع:**
231233
- **حداکثر ۵۰ op** در هر بَچ — اگر سشن‌های فعال بیشتر باشند، مالتی‌پلکسر چند بَچ می‌فرستد
232234
- **سقف payload ۴ مگابایت** در هر بَچ — خیلی کمتر از ۵۰ مگابایت Apps Script

docs/guide.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,8 @@ max_concurrent = 30 × number_of_deployment_ids
227227

228228
More deployments = more total concurrency = lower per-session latency. Each batch is selected from the configured IDs with a local rolling 24-hour ledger, spreading load and steering away from deployments this client has already driven near the free-tier request budget.
229229

230+
The desktop **Script health** panel shows this local state without changing routing behavior: masked deployment ID, locally observed calls inside the rolling 24-hour window, whether the local free-tier steering threshold is saturated, any remaining cooldown, the failure class/reason that set that cooldown, and the current timeout-strike count. Treat it as client-side telemetry only; Google may also count requests from other devices using the same deployment.
231+
230232
**Resource guards:**
231233
- **50 ops max** per batch — if more sessions are active, the mux splits into multiple batches
232234
- **4 MB payload cap** per batch — well under Apps Script's 50 MB limit

src/bin/ui.rs

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ struct UiState {
117117
running: bool,
118118
started_at: Option<Instant>,
119119
last_stats: Option<mhrv_rs::domain_fronter::StatsSnapshot>,
120+
last_script_health: Vec<mhrv_rs::domain_fronter::ScriptHealthSnapshot>,
120121
last_per_site: Vec<(String, mhrv_rs::domain_fronter::HostStat)>,
121122
log: VecDeque<String>,
122123
/// Result + timestamp for transient status banners (auto-hide after 10s).
@@ -1156,7 +1157,7 @@ impl eframe::App for App {
11561157
ui.add_space(8.0);
11571158

11581159
// ── Status + stats card ────────────────────────────────────────
1159-
let (running, started_at, stats, ca_trusted, last_test_msg, per_site) = {
1160+
let (running, started_at, stats, ca_trusted, last_test_msg, per_site, script_health) = {
11601161
let s = self.shared.state.lock().unwrap();
11611162
(
11621163
s.running,
@@ -1165,6 +1166,7 @@ impl eframe::App for App {
11651166
s.ca_trusted,
11661167
s.last_test_msg.clone(),
11671168
s.last_per_site.clone(),
1169+
s.last_script_health.clone(),
11681170
)
11691171
};
11701172

@@ -1318,6 +1320,66 @@ impl eframe::App for App {
13181320
});
13191321
}
13201322

1323+
if !script_health.is_empty() {
1324+
ui.add_space(2.0);
1325+
egui::CollapsingHeader::new(format!(
1326+
"Script health ({} deployments)",
1327+
script_health.len()
1328+
))
1329+
.default_open(false)
1330+
.show(ui, |ui| {
1331+
egui::ScrollArea::vertical()
1332+
.max_height(160.0)
1333+
.show(ui, |ui| {
1334+
egui::Grid::new("script_health")
1335+
.num_columns(5)
1336+
.spacing([8.0, 2.0])
1337+
.striped(true)
1338+
.show(ui, |ui| {
1339+
ui.label(egui::RichText::new("script").strong());
1340+
ui.label(egui::RichText::new("quota").strong());
1341+
ui.label(egui::RichText::new("cooldown").strong());
1342+
ui.label(egui::RichText::new("reason").strong());
1343+
ui.label(egui::RichText::new("timeouts").strong());
1344+
ui.end_row();
1345+
1346+
for st in &script_health {
1347+
let quota = format!(
1348+
"{} / {}{}",
1349+
st.quota_used,
1350+
st.quota_limit,
1351+
if st.quota_saturated { " saturated" } else { "" }
1352+
);
1353+
let cooldown = st
1354+
.cooldown_secs
1355+
.map(fmt_seconds_compact)
1356+
.unwrap_or_else(|| "-".to_string());
1357+
let reason = st
1358+
.cooldown_reason
1359+
.as_deref()
1360+
.unwrap_or("-")
1361+
.to_string();
1362+
ui.label(egui::RichText::new(&st.script_id).monospace());
1363+
ui.label(egui::RichText::new(quota).monospace());
1364+
ui.label(egui::RichText::new(cooldown).monospace());
1365+
ui.label(egui::RichText::new(reason).small());
1366+
ui.label(
1367+
egui::RichText::new(st.timeout_strikes.to_string())
1368+
.monospace(),
1369+
);
1370+
ui.end_row();
1371+
}
1372+
});
1373+
});
1374+
ui.small(
1375+
egui::RichText::new(
1376+
"Local view only: Google quota can also be consumed by other clients using the same deployment.",
1377+
)
1378+
.color(egui::Color32::from_gray(130)),
1379+
);
1380+
});
1381+
}
1382+
13211383
if !per_site.is_empty() {
13221384
ui.add_space(2.0);
13231385
egui::CollapsingHeader::new(format!("Per-site ({} hosts)", per_site.len()))
@@ -1949,6 +2011,16 @@ fn fmt_duration(d: Duration) -> String {
19492011
format!("{:02}:{:02}:{:02}", s / 3600, (s / 60) % 60, s % 60)
19502012
}
19512013

2014+
/// Renders a duration given in whole seconds using at most two units.
///
/// * under one minute: `"{s}s"`
/// * under one hour: `"{m}m {s}s"`
/// * one hour or more: `"{h}h {m}m"` (leftover seconds are not shown)
fn fmt_seconds_compact(seconds: u64) -> String {
    match seconds {
        0..=59 => format!("{}s", seconds),
        60..=3599 => format!("{}m {}s", seconds / 60, seconds % 60),
        _ => format!("{}h {}m", seconds / 3600, (seconds / 60) % 60),
    }
}
2023+
19522024
fn fmt_bytes(b: u64) -> String {
19532025
const K: u64 = 1024;
19542026
const M: u64 = K * K;
@@ -1986,9 +2058,11 @@ fn background_thread(shared: Arc<Shared>, rx: Receiver<Cmd>) {
19862058
if let Some(fronter) = f.as_ref() {
19872059
let s = fronter.snapshot_stats();
19882060
let per_site = fronter.snapshot_per_site();
2061+
let script_health = fronter.snapshot_script_health();
19892062
let mut st = shared.state.lock().unwrap();
19902063
st.last_stats = Some(s);
19912064
st.last_per_site = per_site;
2065+
st.last_script_health = script_health;
19922066
}
19932067
});
19942068
}
@@ -2064,6 +2138,7 @@ fn background_thread(shared: Arc<Shared>, rx: Receiver<Cmd>) {
20642138
// or normal shutdown without Cmd::Stop). The
20652139
// Stop handler clears this too — either is fine.
20662140
st.proxy_active = false;
2141+
st.last_script_health.clear();
20672142
}
20682143
push_log(&shared2, "[ui] proxy stopped");
20692144
});
@@ -2094,6 +2169,7 @@ fn background_thread(shared: Arc<Shared>, rx: Receiver<Cmd>) {
20942169
st.running = false;
20952170
st.started_at = None;
20962171
st.proxy_active = false;
2172+
st.last_script_health.clear();
20972173
}
20982174
}
20992175

@@ -2568,3 +2644,15 @@ fn push_log(shared: &Shared, msg: &str) {
25682644
s.log.pop_front();
25692645
}
25702646
}
2647+
2648+
#[cfg(test)]
mod tests {
    use super::fmt_seconds_compact;

    /// One representative value per unit tier (seconds, minutes, hours).
    #[test]
    fn compact_seconds_formatter_scales_units() {
        assert_eq!(fmt_seconds_compact(9), "9s");
        assert_eq!(fmt_seconds_compact(125), "2m 5s");
        assert_eq!(fmt_seconds_compact(3660), "1h 1m");
    }

    /// Pins the exact values at which the formatter switches units, plus
    /// zero and a duration longer than a day (hours are not capped).
    #[test]
    fn compact_seconds_formatter_handles_unit_boundaries() {
        assert_eq!(fmt_seconds_compact(0), "0s");
        assert_eq!(fmt_seconds_compact(59), "59s");
        assert_eq!(fmt_seconds_compact(60), "1m 0s");
        assert_eq!(fmt_seconds_compact(3599), "59m 59s");
        assert_eq!(fmt_seconds_compact(3600), "1h 0m");
        assert_eq!(fmt_seconds_compact(86_400), "24h 0m");
    }
}

src/domain_fronter.rs

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,10 @@ pub struct DomainFronter {
363363
inflight: Arc<Mutex<HashMap<String, broadcast::Sender<Vec<u8>>>>>,
364364
coalesced: AtomicU64,
365365
blacklist: Arc<std::sync::Mutex<HashMap<String, Instant>>>,
366+
/// Human-readable cooldown reason keyed by script ID. Kept separate
367+
/// from `blacklist` so the selection hot path still only checks the
368+
/// timestamp map while diagnostics can explain the active cooldown.
369+
script_cooldown_reasons: Arc<std::sync::Mutex<HashMap<String, String>>>,
366370
/// Per-deployment local call ledger used by `next_script_id` /
367371
/// `next_script_ids` to avoid selecting an already saturated deployment
368372
/// while another configured script still has locally-observed capacity.
@@ -654,6 +658,7 @@ impl DomainFronter {
654658
inflight: Arc::new(Mutex::new(HashMap::new())),
655659
coalesced: AtomicU64::new(0),
656660
blacklist: Arc::new(std::sync::Mutex::new(HashMap::new())),
661+
script_cooldown_reasons: Arc::new(std::sync::Mutex::new(HashMap::new())),
657662
script_quota_ledger: Arc::new(std::sync::Mutex::new(HashMap::new())),
658663
script_timeouts: Arc::new(std::sync::Mutex::new(HashMap::new())),
659664
relay_calls: AtomicU64::new(0),
@@ -816,6 +821,43 @@ impl DomainFronter {
816821
}
817822
}
818823

824+
pub fn snapshot_script_health(&self) -> Vec<ScriptHealthSnapshot> {
825+
let now = Instant::now();
826+
let bl = self.blacklist.lock().unwrap();
827+
let reasons = self.script_cooldown_reasons.lock().unwrap();
828+
let quota = self.script_quota_ledger.lock().unwrap();
829+
let timeouts = self.script_timeouts.lock().unwrap();
830+
831+
self.script_ids
832+
.iter()
833+
.map(|sid| {
834+
let quota_used = quota
835+
.get(sid)
836+
.map(|calls| {
837+
calls
838+
.iter()
839+
.filter(|at| now.saturating_duration_since(**at) < SCRIPT_QUOTA_WINDOW)
840+
.count()
841+
})
842+
.unwrap_or(0);
843+
let cooldown_secs = bl
844+
.get(sid)
845+
.map(|until| until.saturating_duration_since(now).as_secs())
846+
.filter(|secs| *secs > 0);
847+
let timeout_strikes = timeouts.get(sid).map(|(_, strikes)| *strikes).unwrap_or(0);
848+
ScriptHealthSnapshot {
849+
script_id: mask_script_id(sid),
850+
quota_used,
851+
quota_limit: SCRIPT_QUOTA_FREE_TIER_CALLS,
852+
quota_saturated: quota_used >= SCRIPT_QUOTA_FREE_TIER_CALLS,
853+
cooldown_secs,
854+
cooldown_reason: reasons.get(sid).cloned(),
855+
timeout_strikes,
856+
}
857+
})
858+
.collect()
859+
}
860+
819861
pub fn num_scripts(&self) -> usize {
820862
self.script_ids.len()
821863
}
@@ -837,6 +879,10 @@ impl DomainFronter {
837879
let mut bl = self.blacklist.lock().unwrap();
838880
let now = Instant::now();
839881
bl.retain(|_, until| *until > now);
882+
self.script_cooldown_reasons
883+
.lock()
884+
.unwrap()
885+
.retain(|sid, _| bl.contains_key(sid));
840886
let mut quota = self.script_quota_ledger.lock().unwrap();
841887
prune_script_quota_ledger(&mut quota, now);
842888

@@ -864,6 +910,7 @@ impl DomainFronter {
864910
if let Some((sid, _)) = bl.iter().min_by_key(|(_, t)| **t) {
865911
let sid = sid.clone();
866912
bl.remove(&sid);
913+
self.script_cooldown_reasons.lock().unwrap().remove(&sid);
867914
record_script_quota_call_locked(&mut quota, &sid, now);
868915
return sid;
869916
}
@@ -884,6 +931,10 @@ impl DomainFronter {
884931
let mut bl = self.blacklist.lock().unwrap();
885932
let now = Instant::now();
886933
bl.retain(|_, until| *until > now);
934+
self.script_cooldown_reasons
935+
.lock()
936+
.unwrap()
937+
.retain(|sid, _| bl.contains_key(sid));
887938
let mut quota = self.script_quota_ledger.lock().unwrap();
888939
prune_script_quota_ledger(&mut quota, now);
889940

@@ -928,6 +979,10 @@ impl DomainFronter {
928979
let until = Instant::now() + cooldown;
929980
let mut bl = self.blacklist.lock().unwrap();
930981
bl.insert(script_id.to_string(), until);
982+
self.script_cooldown_reasons
983+
.lock()
984+
.unwrap()
985+
.insert(script_id.to_string(), reason.to_string());
931986
tracing::warn!(
932987
"blacklisted script {} for {}s: {}",
933988
mask_script_id(script_id),
@@ -4946,6 +5001,24 @@ pub struct StatsSnapshot {
49465001
pub h2_disabled: bool,
49475002
}
49485003

5004+
/// Point-in-time, client-side health view of a single deployment.
///
/// All values are local observations made by this process; none of them is
/// read back from Google.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ScriptHealthSnapshot {
    /// Deployment ID in masked form (`prefix...suffix`), so it can be shown
    /// in logs or the UI.
    pub script_id: String,
    /// Number of calls this client recorded within the rolling 24-hour
    /// steering window.
    pub quota_used: usize,
    /// Threshold the local selection guard compares `quota_used` against.
    /// It is a client-side free-tier steering limit, not an authoritative
    /// Google-side quota value.
    pub quota_limit: usize,
    /// Whether the local threshold has been reached.
    pub quota_saturated: bool,
    /// Seconds of local cooldown still remaining; `None` when the
    /// deployment is not currently sidelined.
    pub cooldown_secs: Option<u64>,
    /// Human-readable failure class/reason that was supplied when the
    /// cooldown was set.
    pub cooldown_reason: Option<String>,
    /// Timeout strikes currently counted inside the auto-blacklist rolling
    /// window.
    pub timeout_strikes: u32,
}
5021+
49495022
impl StatsSnapshot {
49505023
pub fn hit_rate(&self) -> f64 {
49515024
let total = self.cache_hits + self.cache_misses;
@@ -5183,6 +5256,15 @@ mod tests {
51835256
use std::task::{Context, Poll};
51845257
use tokio::io::{duplex, AsyncRead, AsyncWriteExt, ReadBuf};
51855258

5259+
/// Test helper: returns the first row whose `script_id` equals
/// `masked_script_id` (the ID must already be in masked form).
///
/// Panics when no row matches, so a missing deployment fails the test
/// immediately.
fn find_script_health<'a>(
    rows: &'a [ScriptHealthSnapshot],
    masked_script_id: &str,
) -> &'a ScriptHealthSnapshot {
    let index = rows
        .iter()
        .position(|candidate| candidate.script_id == masked_script_id)
        .expect("script health row must exist");
    &rows[index]
}
5267+
51865268
// Test fixture for ungraceful TLS close: emit a fixed prefix of bytes
51875269
// then return io::ErrorKind::UnexpectedEof on the next read. Mirrors
51885270
// what rustls surfaces when the peer closes TCP without sending a
@@ -6536,6 +6618,54 @@ hello";
65366618
);
65376619
}
65386620

6621+
/// Script A gets three ledger calls and an active cooldown; script B gets a
/// single timeout strike. The snapshot must report each piece of state on
/// the right row.
#[test]
fn script_health_snapshot_exposes_quota_and_cooldown_state() {
    const SCRIPT_A: &str = "AKfycbxScriptHealthAlpha001";
    const SCRIPT_B: &str = "AKfycbxScriptHealthBravo002";
    const COOLDOWN_SECS: u64 = 600;
    const REASON: &str = "transient relay cooldown: HTTP 502";

    let fronter = fronter_for_script_ids(&[SCRIPT_A, SCRIPT_B]);
    let seeded_at = Instant::now();
    seed_script_quota(&fronter, SCRIPT_A, 3, seeded_at);
    fronter.blacklist_script_for(SCRIPT_A, Duration::from_secs(COOLDOWN_SECS), REASON);
    fronter.record_timeout_strike(SCRIPT_B);

    let snapshot = fronter.snapshot_script_health();
    let alpha = find_script_health(&snapshot, &mask_script_id(SCRIPT_A));
    let bravo = find_script_health(&snapshot, &mask_script_id(SCRIPT_B));

    // Script A: quota usage and an active, explained cooldown.
    assert_eq!(alpha.quota_used, 3);
    assert_eq!(alpha.quota_limit, SCRIPT_QUOTA_FREE_TIER_CALLS);
    assert!(!alpha.quota_saturated);
    assert!(matches!(
        alpha.cooldown_secs,
        Some(secs) if secs > 0 && secs <= COOLDOWN_SECS
    ));
    assert_eq!(alpha.cooldown_reason.as_deref(), Some(REASON));

    // Script B: one strike recorded, but no cooldown.
    assert_eq!(bravo.timeout_strikes, 1);
    assert!(bravo.cooldown_secs.is_none());
}
6650+
6651+
/// Seeding exactly the free-tier threshold worth of calls must flag the
/// deployment as saturated.
#[test]
fn script_health_snapshot_marks_local_quota_saturation() {
    const SCRIPT_A: &str = "AKfycbxScriptHealthAlpha001";

    let fronter = fronter_for_script_ids(&[SCRIPT_A]);
    let seeded_at = Instant::now();
    seed_script_quota(&fronter, SCRIPT_A, SCRIPT_QUOTA_FREE_TIER_CALLS, seeded_at);

    let snapshot = fronter.snapshot_script_health();
    let saturated = find_script_health(&snapshot, &mask_script_id(SCRIPT_A));

    assert_eq!(saturated.quota_used, SCRIPT_QUOTA_FREE_TIER_CALLS);
    assert!(saturated.quota_saturated);
}
6668+
65396669
#[test]
65406670
fn mask_script_id_hides_middle() {
65416671
assert_eq!(mask_script_id("short"), "***");

0 commit comments

Comments
 (0)