Add metrics to monitor cluster throughput

DiegoTavares · DiegoTavares · commit e7089bba52d5 · 2026-06-19T15:35:51.000-07:00
diff --git a/rust/crates/scheduler/src/cluster.rs b/rust/crates/scheduler/src/cluster.rs
@@ -33,7 +33,7 @@ use crate::{
     cluster_key::{Tag, TagType},
     config::CONFIG,
     dao::{helpers::parse_uuid, ClusterDao},
-    metrics::observe_cluster_round_trip,
+    metrics,
 };
 
 pub static CLUSTER_ROUNDS: AtomicUsize = AtomicUsize::new(0);
@@ -73,6 +73,20 @@ impl Cluster {
             tags: tags.into_iter().collect(),
         }
     }
+
+    /// Bounded metric label for the cluster's tag class. Clusters are built from
+    /// a single `TagType` (alloc clusters are one-tag; manual/hostname/hardware
+    /// clusters chunk tags of one type), so the first tag determines the class.
+    /// Static labels cap Prometheus cardinality: four classes plus `unknown` (no tags).
+    pub fn cluster_type(&self) -> &'static str {
+        match self.tags.iter().next().map(|t| &t.ttype) {
+            Some(TagType::Alloc) => "alloc",
+            Some(TagType::Manual) => "manual",
+            Some(TagType::HostName) => "hostname",
+            Some(TagType::Hardware) => "hardware",
+            None => "unknown",
+        }
+    }
 }
 
 /// Inputs retained by a DB-backed [`ClusterFeed`] so it can periodically reload
@@ -629,11 +643,16 @@ impl ClusterFeed {
                             return ControlFlow::Break(());
                         }
                         let now = Instant::now();
+                        // Capture the type before `item` is moved into the map.
+                        let cluster_type = item.cluster_type();
                         let mut last_sent_lock = last_sent_map_producer
                             .lock()
                             .unwrap_or_else(|p| p.into_inner());
                         if let Some(prev) = last_sent_lock.insert(item, now) {
-                            observe_cluster_round_trip(now.duration_since(prev));
+                            metrics::observe_cluster_round_trip(
+                                cluster_type,
+                                now.duration_since(prev),
+                            );
                         }
                     } else if !completed_round {
                         // Skipped a sleeping cluster mid-round; yield so we don't starve the runtime.
@@ -650,6 +669,34 @@ impl ClusterFeed {
                                 sleep_map.lock().unwrap_or_else(|p| p.into_inner());
                             sleep_map_lock.len()
                         };
+
+                        // Sample fan-out gauges once per lap (cheap relative to a
+                        // full round-robin pass). CLUSTERS_TOTAL by type is the
+                        // primary fan-out signal; CLUSTERS_SLEEPING shows how much
+                        // of the set is backed off at any instant.
+                        {
+                            // Seed every label `cluster_type()` can return with zero so
+                            // a class whose clusters all left the feed is reset to 0
+                            // instead of exporting its last non-zero sample forever.
+                            let mut by_type: HashMap<&'static str, i64> = HashMap::from([
+                                ("alloc", 0),
+                                ("manual", 0),
+                                ("hostname", 0),
+                                ("hardware", 0),
+                                ("unknown", 0),
+                            ]);
+                            {
+                                let clusters =
+                                    feed.read().unwrap_or_else(|p| p.into_inner());
+                                for c in clusters.iter() {
+                                    *by_type.entry(c.cluster_type()).or_default() += 1;
+                                }
+                            }
+                            for (cluster_type, count) in by_type {
+                                metrics::set_clusters_total(cluster_type, count);
+                            }
+                            metrics::set_clusters_sleeping(sleeping_count as i64);
+                        }
                         if sleeping_count >= cluster_size {
                             // Ensure this doesn't loop forever when there's a limit configured
                             all_sleeping_rounds += 1;
diff --git a/rust/crates/scheduler/src/metrics/mod.rs b/rust/crates/scheduler/src/metrics/mod.rs
@@ -13,8 +13,9 @@
 use axum::{response::IntoResponse, routing::get, Router};
 use lazy_static::lazy_static;
 use prometheus::{
-    register_counter, register_counter_vec, register_histogram, Counter, CounterVec, Encoder,
-    Histogram, TextEncoder,
+    register_counter, register_counter_vec, register_gauge, register_gauge_vec,
+    register_histogram, register_histogram_vec, Counter, CounterVec, Encoder, Gauge, GaugeVec,
+    Histogram, HistogramVec, TextEncoder,
 };
 use std::sync::atomic::{AtomicU64, Ordering};
 use std::time::Duration;
@@ -87,13 +88,89 @@ lazy_static! {
     .expect("Failed to register job_query_duration_seconds histogram");
 
     // Cluster feed metrics from cluster.rs
-    pub static ref CLUSTER_ROUND_TRIP_SECONDS: Histogram = register_histogram!(
+    //
+    // Labeled by `cluster_type` (alloc / manual / hostname / hardware) so the
+    // round-trip tail can be attributed to a tag class. A large fan-out of
+    // chunked manual-tag clusters lengthens the round-robin lap and shows up
+    // here as a worse tail on the `manual` series than on `alloc`. The label is
+    // one of the four `TagType` classes or `unknown` (cluster with no tags) —
+    // never per-tag, which would blow up cardinality on farms with thousands of tags.
+    pub static ref CLUSTER_ROUND_TRIP_SECONDS: HistogramVec = register_histogram_vec!(
         "scheduler_cluster_round_trip_seconds",
-        "Time between successive emissions of the same active (non-sleeping) cluster",
+        "Time between successive emissions of the same active (non-sleeping) cluster, by cluster type",
+        &["cluster_type"],
         vec![0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0]
     )
     .expect("Failed to register cluster_round_trip_seconds histogram");
 
+    // Size of the live cluster set, by tag class, sampled once per round-robin lap by
+    // the feed producer. This is the fan-out magnitude: with `manual_tags_chunk_size = N`
+    // and T manual tags, the `manual` series is ~T/N. A large value here is the prime
+    // suspect for slow per-cluster revisits (each cluster only gets a turn once per full
+    // lap). Named without `_total`: Prometheus conventions reserve that suffix for counters.
+    pub static ref CLUSTERS_TOTAL: GaugeVec = register_gauge_vec!(
+        "scheduler_clusters",
+        "Number of clusters in the live feed set, by cluster type",
+        &["cluster_type"]
+    )
+    .expect("Failed to register clusters_total gauge");
+
+    // Clusters currently sleeping (skipped this lap), sampled once per lap.
+    // Sleeping clusters are invisible to dispatch until their backoff expires
+    // (`cluster_empty_sleep` / `cluster_saturated_sleep`). A high value relative
+    // to CLUSTERS_TOTAL means most of the set is backed off at any instant.
+    pub static ref CLUSTERS_SLEEPING: Gauge = register_gauge!(
+        "scheduler_clusters_sleeping",
+        "Number of clusters currently sleeping (skipped each lap)"
+    )
+    .expect("Failed to register clusters_sleeping gauge");
+
+    // Frames booked in a single cluster pass (one emission off the feed),
+    // labeled by `cluster_type`. If this pins at the per-pass ceiling
+    // (`max_jobs_per_cluster_pass` x `dispatch_frames_per_layer_limit`) while a
+    // backlog persists, throughput-per-turn x infrequent-turns explains a
+    // growing queue: the farm has capacity but each cluster only drains a
+    // bounded slice per visit.
+    pub static ref FRAMES_DISPATCHED_PER_PASS: HistogramVec = register_histogram_vec!(
+        "scheduler_frames_dispatched_per_pass",
+        "Frames dispatched in a single cluster pass, by cluster type",
+        &["cluster_type"],
+        vec![0.0, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 250.0, 500.0]
+    )
+    .expect("Failed to register frames_dispatched_per_pass histogram");
+
+    // Why each cluster pass ended, labeled by `reason`:
+    //   booked       - placed at least one frame
+    //   saturated    - jobs were pending but nothing fit (farm full or gated)
+    //   no_jobs      - the job query returned nothing eligible
+    //   query_error  - the job query failed (pass backed off and will retry)
+    // `saturated` dominating while hosts sit idle points at matching/tagging,
+    // not capacity. `no_jobs` dominating means the cluster set is mostly empty
+    // churn and the round-robin lap is paying for clusters with no work.
+    pub static ref PASS_TERMINATED_REASON_TOTAL: CounterVec = register_counter_vec!(
+        "scheduler_pass_terminated_reason_total",
+        "Cluster passes by terminal reason",
+        &["reason"]
+    )
+    .expect("Failed to register pass_terminated_reason_total counter");
+
+    // Outcome of each host-checkout attempt in the matcher loop, labeled by
+    // `outcome`:
+    //   booked          - a host was checked out and the frame dispatched
+    //   no_match        - check_out returned NoCandidateAvailable
+    //   dispatch_error  - host checked out but the dispatch transaction failed
+    // The reported "hosts available but not booked" symptom shows up as a high
+    // `no_match` rate; cross-reference with CLUSTERS_TOTAL / round-trip to tell
+    // a genuine no-fit from a starved cluster that simply isn't getting visited.
+    // A finer no_match breakdown (reserved / cas_lost / gate_rejected) lives
+    // inside host_cache and is deferred to a later tier.
+    pub static ref CHECKOUT_OUTCOME_TOTAL: CounterVec = register_counter_vec!(
+        "scheduler_checkout_outcome_total",
+        "Host checkout attempts by outcome",
+        &["outcome"]
+    )
+    .expect("Failed to register checkout_outcome_total counter");
+
     // E-PVM placement metrics from host_cache/cache.rs. Observed only on the
     // Epvm path (Saturation always scores 0.0). Buckets are dimensionless W3
     // fractional-layer-frames units; calibration may need adjustment after
@@ -255,10 +332,51 @@ pub fn observe_job_query_duration(duration: Duration) {
     JOB_QUERY_DURATION_SECONDS.observe(duration.as_secs_f64());
 }
 
-/// Helper function to observe cluster round-trip duration
+/// Helper function to observe cluster round-trip duration, labeled by cluster type.
+#[inline]
+pub fn observe_cluster_round_trip(cluster_type: &str, duration: Duration) {
+    CLUSTER_ROUND_TRIP_SECONDS
+        .with_label_values(&[cluster_type])
+        .observe(duration.as_secs_f64());
+}
+
+/// Sets the live cluster-set size for a given cluster type. Sampled once per
+/// round-robin lap by the feed producer.
+#[inline]
+pub fn set_clusters_total(cluster_type: &str, count: i64) {
+    CLUSTERS_TOTAL
+        .with_label_values(&[cluster_type])
+        .set(count as f64);
+}
+
+/// Sets the number of clusters currently sleeping. Sampled once per lap.
+#[inline]
+pub fn set_clusters_sleeping(count: i64) {
+    CLUSTERS_SLEEPING.set(count as f64);
+}
+
+/// Records the number of frames booked in a single cluster pass.
+#[inline]
+pub fn observe_frames_dispatched_per_pass(cluster_type: &str, frames: usize) {
+    FRAMES_DISPATCHED_PER_PASS
+        .with_label_values(&[cluster_type])
+        .observe(frames as f64);
+}
+
+/// Records the terminal reason for a cluster pass
+/// (`booked` / `saturated` / `no_jobs` / `query_error`).
+#[inline]
+pub fn increment_pass_terminated_reason(reason: &str) {
+    PASS_TERMINATED_REASON_TOTAL
+        .with_label_values(&[reason])
+        .inc();
+}
+
+/// Records the outcome of a single host-checkout attempt
+/// (`booked` / `no_match` / `dispatch_error`).
 #[inline]
-pub fn observe_cluster_round_trip(duration: Duration) {
-    CLUSTER_ROUND_TRIP_SECONDS.observe(duration.as_secs_f64());
+pub fn increment_checkout_outcome(outcome: &str) {
+    CHECKOUT_OUTCOME_TOTAL.with_label_values(&[outcome]).inc();
 }
 
 /// Records the E-PVM score of the host returned by `check_out_best`.
diff --git a/rust/crates/scheduler/src/pipeline/entrypoint.rs b/rust/crates/scheduler/src/pipeline/entrypoint.rs
@@ -70,6 +70,9 @@ pub async fn run(cluster_feed: ClusterFeed) -> miette::Result<()> {
             let feed_sender = feed_sender.clone();
 
             async move {
+                // Bounded metric label captured before `cluster` is moved into a
+                // FeedMessage::Sleep below.
+                let cluster_type = cluster.cluster_type();
                 let jobs = job_fetcher
                     .query_pending_jobs_by_show_facility_and_tags(
                         cluster.show_id,
@@ -98,16 +101,33 @@ pub async fn run(cluster_feed: ClusterFeed) -> miette::Result<()> {
                                 },
                             )
                             .await;
+                        let processed = processed_jobs.load(Ordering::Relaxed);
+                        let dispatched = dispatched_frames.load(Ordering::Relaxed);
+
+                        // Per-pass yield and terminal reason. Together these show
+                        // whether each cluster turn drains a bounded slice while a
+                        // backlog persists (cap-limited) versus genuinely finding
+                        // no work or no fit.
+                        metrics::observe_frames_dispatched_per_pass(cluster_type, dispatched);
+                        let reason = if processed == 0 {
+                            "no_jobs"
+                        } else if dispatched == 0 {
+                            "saturated"
+                        } else {
+                            "booked"
+                        };
+                        metrics::increment_pass_terminated_reason(reason);
+
                         // If no jobs got processed, sleep to prevent hammering the database with
                         // queries with no outcome
-                        if processed_jobs.load(Ordering::Relaxed) == 0 {
+                        if processed == 0 {
                             let _ = feed_sender
                                 .send(FeedMessage::Sleep(
                                     cluster,
                                     CONFIG.queue.cluster_empty_sleep,
                                 ))
                                 .await;
-                        } else if dispatched_frames.load(Ordering::Relaxed) == 0 {
+                        } else if dispatched == 0 {
                             // Jobs are pending but the whole pass placed nothing —
                             // typically a saturated farm (no host fits anywhere).
                             // Without a back-off this cluster would re-query its
@@ -144,6 +164,7 @@ pub async fn run(cluster_feed: ClusterFeed) -> miette::Result<()> {
                         // would shut the scheduler down on the first hiccup; back
                         // this cluster off instead and let the next pass retry.
                         error!("Failed to fetch jobs for cluster {}: {}", cluster, err);
+                        metrics::increment_pass_terminated_reason("query_error");
                         let _ = feed_sender
                             .send(FeedMessage::Sleep(
                                 cluster,
diff --git a/rust/crates/scheduler/src/pipeline/matcher.rs b/rust/crates/scheduler/src/pipeline/matcher.rs
@@ -443,6 +443,7 @@ impl MatchingService {
                             updated_host,
                             updated_layer,
                         }) => {
+                            metrics::increment_checkout_outcome("booked");
                             // Track cores actually consumed so the next iteration's
                             // LayerProfile sees the local picture of usage. The same
                             // delta applies to the (show, alloc) subscription burst
@@ -479,6 +480,7 @@ impl MatchingService {
                             }
                         }
                         Err(err) => {
+                            metrics::increment_checkout_outcome("dispatch_error");
                             // On error, we lost the layer since it was moved to DispatchLayerMessage
                             // This means we can't continue with this layer
                             Self::log_dispatch_error_with_info(
@@ -504,6 +506,7 @@ impl MatchingService {
 
                     match err {
                         crate::host_cache::HostCacheError::NoCandidateAvailable => {
+                            metrics::increment_checkout_outcome("no_match");
                             debug!(
                                 "No host candidate available for layer {}. {:?}",
                                 current_layer_version.as_ref().unwrap(),