From c3323be1684444085a301a92b7f3927118b6b87b Mon Sep 17 00:00:00 2001
From: Chad Nehemiah <chad.nehemiah94@gmail.com>
Date: Thu, 29 May 2025 04:07:45 -0500
Subject: [PATCH 01/13] fix: ensure client errors are correctly tracked (#635)

* fix: ensure client errors are correctly tracked

* chore: update error tracking

* chore: adjust clippy

* chore: grammatical error
---
 .../src/handlers/chat_completions.rs          | 56 ++++++++++++--
 atoma-service/src/handlers/completions.rs     | 55 ++++++++++++--
 atoma-service/src/handlers/embeddings.rs      | 58 +++++++++++---
 .../src/handlers/image_generations.rs         | 70 +++++++++++++----
 atoma-service/src/handlers/metrics.rs         | 75 +++++++++++++++++++
 5 files changed, 279 insertions(+), 35 deletions(-)

diff --git a/atoma-service/src/handlers/chat_completions.rs b/atoma-service/src/handlers/chat_completions.rs
index eb320143..7a2815d0 100644
--- a/atoma-service/src/handlers/chat_completions.rs
+++ b/atoma-service/src/handlers/chat_completions.rs
@@ -3,7 +3,9 @@ use crate::{
         handle_concurrent_requests_count_decrement,
         metrics::{
             CHAT_COMPLETIONS_CONFIDENTIAL_NUM_REQUESTS, CHAT_COMPLETIONS_ESTIMATED_TOTAL_TOKENS,
-            TOTAL_FAILED_CHAT_CONFIDENTIAL_REQUESTS, TOTAL_FAILED_CHAT_REQUESTS,
+            TOTAL_BAD_REQUESTS, TOTAL_FAILED_CHAT_CONFIDENTIAL_REQUESTS,
+            TOTAL_FAILED_CHAT_REQUESTS, TOTAL_LOCKED_REQUESTS, TOTAL_TOO_EARLY_REQUESTS,
+            TOTAL_TOO_MANY_REQUESTS, TOTAL_UNAUTHORIZED_REQUESTS,
         },
         sign_response_and_update_stack_hash, update_fiat_amount, update_stack_num_compute_units,
     },
@@ -266,8 +268,30 @@ pub async fn chat_completions_handler(
             Ok(response)
         }
         Err(e) => {
-            TOTAL_FAILED_CHAT_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
-            TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+            match e.status_code() {
+                StatusCode::TOO_MANY_REQUESTS => {
+                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::BAD_REQUEST => {
+                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::LOCKED => {
+                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::TOO_EARLY => {
+                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::UNAUTHORIZED => {
+                    TOTAL_UNAUTHORIZED_REQUESTS
+                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                _ => {
+                    TOTAL_FAILED_CHAT_REQUESTS
+                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+            }
+
             // NOTE: We need to update the stack number of tokens as the service failed to generate
             // a proper response. For this reason, we set the total number of tokens to 0.
             // This will ensure that the stack number of tokens is not updated, and the stack
@@ -476,9 +500,29 @@ pub async fn confidential_chat_completions_handler(
             Ok(response)
         }
         Err(e) => {
-            TOTAL_FAILED_CHAT_CONFIDENTIAL_REQUESTS
-                .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
-            TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+            match e.status_code() {
+                StatusCode::TOO_MANY_REQUESTS => {
+                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::BAD_REQUEST => {
+                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::LOCKED => {
+                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::TOO_EARLY => {
+                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::UNAUTHORIZED => {
+                    TOTAL_UNAUTHORIZED_REQUESTS
+                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                _ => {
+                    TOTAL_FAILED_CHAT_CONFIDENTIAL_REQUESTS
+                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+            }
             // NOTE: We need to update the stack number of tokens as the service failed to generate
             // a proper response. For this reason, we set the total number of tokens to 0.
             // This will ensure that the stack number of tokens is not updated, and the stack
diff --git a/atoma-service/src/handlers/completions.rs b/atoma-service/src/handlers/completions.rs
index fac65537..268d4e4e 100644
--- a/atoma-service/src/handlers/completions.rs
+++ b/atoma-service/src/handlers/completions.rs
@@ -3,7 +3,9 @@ use crate::{
         handle_concurrent_requests_count_decrement,
         metrics::{
             CHAT_COMPLETIONS_CONFIDENTIAL_NUM_REQUESTS, CHAT_COMPLETIONS_ESTIMATED_TOTAL_TOKENS,
-            TOTAL_FAILED_CHAT_CONFIDENTIAL_REQUESTS, TOTAL_FAILED_CHAT_REQUESTS,
+            TOTAL_BAD_REQUESTS, TOTAL_FAILED_CHAT_CONFIDENTIAL_REQUESTS,
+            TOTAL_FAILED_CHAT_REQUESTS, TOTAL_LOCKED_REQUESTS, TOTAL_TOO_EARLY_REQUESTS,
+            TOTAL_TOO_MANY_REQUESTS, TOTAL_UNAUTHORIZED_REQUESTS,
         },
         sign_response_and_update_stack_hash, update_fiat_amount, update_stack_num_compute_units,
     },
@@ -240,8 +242,29 @@ pub async fn completions_handler(
             Ok(response)
         }
         Err(e) => {
-            TOTAL_FAILED_CHAT_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
-            TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+            match e.status_code() {
+                StatusCode::TOO_MANY_REQUESTS => {
+                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::BAD_REQUEST => {
+                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::LOCKED => {
+                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::TOO_EARLY => {
+                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::UNAUTHORIZED => {
+                    TOTAL_UNAUTHORIZED_REQUESTS
+                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                _ => {
+                    TOTAL_FAILED_CHAT_REQUESTS
+                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+            }
             // NOTE: We need to update the stack number of tokens as the service failed to generate
             // a proper response. For this reason, we set the total number of tokens to 0.
             // This will ensure that the stack number of tokens is not updated, and the stack
@@ -450,9 +473,29 @@ pub async fn confidential_completions_handler(
             Ok(response)
         }
         Err(e) => {
-            TOTAL_FAILED_CHAT_CONFIDENTIAL_REQUESTS
-                .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
-            TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+            match e.status_code() {
+                StatusCode::TOO_MANY_REQUESTS => {
+                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::BAD_REQUEST => {
+                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::LOCKED => {
+                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::TOO_EARLY => {
+                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                StatusCode::UNAUTHORIZED => {
+                    TOTAL_UNAUTHORIZED_REQUESTS
+                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+                _ => {
+                    TOTAL_FAILED_CHAT_CONFIDENTIAL_REQUESTS
+                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                }
+            }
             if let Some(stack_small_id) = stack_small_id {
                 // NOTE: We need to update the stack number of tokens as the service failed to generate
                 // a proper response. For this reason, we set the total number of tokens to 0.
diff --git a/atoma-service/src/handlers/embeddings.rs b/atoma-service/src/handlers/embeddings.rs
index 31ce2a8f..7f3ba54d 100644
--- a/atoma-service/src/handlers/embeddings.rs
+++ b/atoma-service/src/handlers/embeddings.rs
@@ -7,9 +7,10 @@ use crate::{
         handle_confidential_compute_encryption_response,
         metrics::{
             TEXT_EMBEDDINGS_CONFIDENTIAL_NUM_REQUESTS, TEXT_EMBEDDINGS_LATENCY_METRICS,
-            TEXT_EMBEDDINGS_NUM_REQUESTS, TOTAL_COMPLETED_REQUESTS, TOTAL_FAILED_REQUESTS,
-            TOTAL_FAILED_TEXT_EMBEDDING_CONFIDENTIAL_REQUESTS,
-            TOTAL_FAILED_TEXT_EMBEDDING_REQUESTS,
+            TEXT_EMBEDDINGS_NUM_REQUESTS, TOTAL_BAD_REQUESTS, TOTAL_COMPLETED_REQUESTS,
+            TOTAL_FAILED_REQUESTS, TOTAL_FAILED_TEXT_EMBEDDING_CONFIDENTIAL_REQUESTS,
+            TOTAL_FAILED_TEXT_EMBEDDING_REQUESTS, TOTAL_LOCKED_REQUESTS, TOTAL_TOO_EARLY_REQUESTS,
+            TOTAL_TOO_MANY_REQUESTS, TOTAL_UNAUTHORIZED_REQUESTS,
         },
         sign_response_and_update_stack_hash, update_fiat_amount, update_stack_num_compute_units,
     },
@@ -18,6 +19,7 @@ use crate::{
     types::{ConfidentialComputeRequest, ConfidentialComputeResponse},
 };
 use axum::{extract::State, Extension, Json};
+use hyper::StatusCode;
 use opentelemetry::KeyValue;
 use reqwest::Client;
 use serde_json::Value;
@@ -138,9 +140,28 @@ pub async fn embeddings_handler(
             Ok(response)
         }
         Err(e) => {
-            TOTAL_FAILED_TEXT_EMBEDDING_REQUESTS
-                .add(1, &[KeyValue::new("model", model.as_str().to_owned())]);
-            TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new("model", model.as_str().to_owned())]);
+            match e.status_code() {
+                StatusCode::TOO_MANY_REQUESTS => {
+                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::BAD_REQUEST => {
+                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::LOCKED => {
+                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::TOO_EARLY => {
+                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::UNAUTHORIZED => {
+                    TOTAL_UNAUTHORIZED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                _ => {
+                    TOTAL_FAILED_TEXT_EMBEDDING_REQUESTS
+                        .add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+            }
             if let Some(stack_small_id) = stack_small_id {
                 let concurrent_requests = handle_concurrent_requests_count_decrement(
                     &state.concurrent_requests_per_stack,
@@ -312,9 +333,28 @@ pub async fn confidential_embeddings_handler(
             Ok(response)
         }
         Err(e) => {
-            TOTAL_FAILED_TEXT_EMBEDDING_CONFIDENTIAL_REQUESTS
-                .add(1, &[KeyValue::new("model", model.as_str().to_owned())]);
-            TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new("model", model.as_str().to_owned())]);
+            match e.status_code() {
+                StatusCode::TOO_MANY_REQUESTS => {
+                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::BAD_REQUEST => {
+                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::LOCKED => {
+                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::TOO_EARLY => {
+                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::UNAUTHORIZED => {
+                    TOTAL_UNAUTHORIZED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                _ => {
+                    TOTAL_FAILED_TEXT_EMBEDDING_CONFIDENTIAL_REQUESTS
+                        .add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+            }
             if let Some(stack_small_id) = stack_small_id {
                 let concurrent_requests = handle_concurrent_requests_count_decrement(
                     &state.concurrent_requests_per_stack,
diff --git a/atoma-service/src/handlers/image_generations.rs b/atoma-service/src/handlers/image_generations.rs
index 12b2251a..b1fb79c4 100644
--- a/atoma-service/src/handlers/image_generations.rs
+++ b/atoma-service/src/handlers/image_generations.rs
@@ -6,8 +6,10 @@ use crate::{
         handle_concurrent_requests_count_decrement,
         metrics::{
             IMAGE_GEN_CONFIDENTIAL_NUM_REQUESTS, IMAGE_GEN_LATENCY_METRICS, IMAGE_GEN_NUM_REQUESTS,
-            TOTAL_COMPLETED_REQUESTS, TOTAL_FAILED_IMAGE_CONFIDENTIAL_GENERATION_REQUESTS,
-            TOTAL_FAILED_IMAGE_GENERATION_REQUESTS, TOTAL_FAILED_REQUESTS,
+            TOTAL_BAD_REQUESTS, TOTAL_COMPLETED_REQUESTS,
+            TOTAL_FAILED_IMAGE_CONFIDENTIAL_GENERATION_REQUESTS,
+            TOTAL_FAILED_IMAGE_GENERATION_REQUESTS, TOTAL_FAILED_REQUESTS, TOTAL_LOCKED_REQUESTS,
+            TOTAL_TOO_EARLY_REQUESTS, TOTAL_TOO_MANY_REQUESTS, TOTAL_UNAUTHORIZED_REQUESTS,
         },
         update_fiat_amount, update_stack_num_compute_units,
     },
@@ -16,6 +18,7 @@ use crate::{
     types::{ConfidentialComputeRequest, ConfidentialComputeResponse},
 };
 use axum::{extract::State, Extension, Json};
+use hyper::StatusCode;
 use opentelemetry::KeyValue;
 use reqwest::Client;
 use serde_json::Value;
@@ -118,28 +121,48 @@ pub async fn image_generations_handler(
     let model = payload
         .get(MODEL_KEY)
         .and_then(|m| m.as_str())
-        .unwrap_or("unknown");
+        .unwrap_or("unknown")
+        .to_string();
 
     match handle_image_generations_response(
         &state,
-        payload.clone(),
+        payload,
         payload_hash,
         stack_small_id,
         client_encryption_metadata,
         &endpoint,
         timer,
-        model.to_string(),
+        model.clone(),
     )
     .await
     {
         Ok(response) => {
-            TOTAL_COMPLETED_REQUESTS.add(1, &[KeyValue::new("model", model.to_owned())]);
+            TOTAL_COMPLETED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
             Ok(response)
         }
         Err(e) => {
-            TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new("model", model.to_owned())]);
-            TOTAL_FAILED_IMAGE_GENERATION_REQUESTS
-                .add(1, &[KeyValue::new("model", model.to_owned())]);
+            match e.status_code() {
+                StatusCode::TOO_MANY_REQUESTS => {
+                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::BAD_REQUEST => {
+                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::LOCKED => {
+                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::TOO_EARLY => {
+                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::UNAUTHORIZED => {
+                    TOTAL_UNAUTHORIZED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                _ => {
+                    TOTAL_FAILED_IMAGE_GENERATION_REQUESTS
+                        .add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+            }
             if let Some(stack_small_id) = stack_small_id {
                 let concurrent_requests = handle_concurrent_requests_count_decrement(
                     &state.concurrent_requests_per_stack,
@@ -263,13 +286,13 @@ pub async fn confidential_image_generations_handler(
 
     match handle_image_generations_response(
         &state,
-        payload.clone(),
+        payload,
         payload_hash,
         stack_small_id,
         client_encryption_metadata,
         &endpoint,
         timer,
-        model.to_string(),
+        model.clone(),
     )
     .await
     {
@@ -306,9 +329,28 @@ pub async fn confidential_image_generations_handler(
             Ok(response)
         }
         Err(e) => {
-            TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new("model", model.clone())]);
-            TOTAL_FAILED_IMAGE_CONFIDENTIAL_GENERATION_REQUESTS
-                .add(1, &[KeyValue::new("model", model.clone())]);
+            match e.status_code() {
+                StatusCode::TOO_MANY_REQUESTS => {
+                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::BAD_REQUEST => {
+                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::LOCKED => {
+                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::TOO_EARLY => {
+                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                StatusCode::UNAUTHORIZED => {
+                    TOTAL_UNAUTHORIZED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+                _ => {
+                    TOTAL_FAILED_IMAGE_CONFIDENTIAL_GENERATION_REQUESTS
+                        .add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                }
+            }
             if let Some(stack_small_id) = stack_small_id {
                 let concurrent_requests = handle_concurrent_requests_count_decrement(
                     &state.concurrent_requests_per_stack,
diff --git a/atoma-service/src/handlers/metrics.rs b/atoma-service/src/handlers/metrics.rs
index 4497f291..7515e0eb 100644
--- a/atoma-service/src/handlers/metrics.rs
+++ b/atoma-service/src/handlers/metrics.rs
@@ -346,6 +346,81 @@ pub static TOTAL_FAILED_CHAT_REQUESTS: LazyLock<Counter<u64>> = LazyLock::new(||
         .build()
 });
 
+/// Counter metric that tracks requests that failed with a `429 Too Many Requests` status.
+///
+/// # Metric Details
+/// - Name: `atoma_total_too_many_requests`
+/// - Type: Counter
+/// - Labels: `model`
+/// - Unit: requests (count)
+pub static TOTAL_TOO_MANY_REQUESTS: LazyLock<Counter<u64>> = LazyLock::new(|| {
+    GLOBAL_METER
+        .u64_counter("atoma_total_too_many_requests")
+        .with_description("Total number of too many requests")
+        .with_unit("requests")
+        .build()
+});
+
+/// Counter metric that tracks requests that failed with a `401 Unauthorized` status.
+///
+/// # Metric Details
+/// - Name: `atoma_total_unauthorized_requests`
+/// - Type: Counter
+/// - Labels: `model`
+/// - Unit: requests (count)
+pub static TOTAL_UNAUTHORIZED_REQUESTS: LazyLock<Counter<u64>> = LazyLock::new(|| {
+    GLOBAL_METER
+        .u64_counter("atoma_total_unauthorized_requests")
+        .with_description("Total number of unauthorized requests")
+        .with_unit("requests")
+        .build()
+});
+
+/// Counter metric that tracks requests that failed with a `425 Too Early` status.
+///
+/// # Metric Details
+/// - Name: `atoma_total_too_early_requests`
+/// - Type: Counter
+/// - Labels: `model`
+/// - Unit: requests (count)
+pub static TOTAL_TOO_EARLY_REQUESTS: LazyLock<Counter<u64>> = LazyLock::new(|| {
+    GLOBAL_METER
+        .u64_counter("atoma_total_too_early_requests")
+        .with_description("Total number of too early requests")
+        .with_unit("requests")
+        .build()
+});
+
+/// Counter metric that tracks requests that failed with a `423 Locked` status.
+///
+/// # Metric Details
+/// - Name: `atoma_total_locked_requests`
+/// - Type: Counter
+/// - Labels: `model`
+/// - Unit: requests (count)
+pub static TOTAL_LOCKED_REQUESTS: LazyLock<Counter<u64>> = LazyLock::new(|| {
+    GLOBAL_METER
+        .u64_counter("atoma_total_locked_requests")
+        .with_description("Total number of locked requests")
+        .with_unit("requests")
+        .build()
+});
+
+/// Counter metric that tracks requests that failed with a `400 Bad Request` status.
+///
+/// # Metric Details
+/// - Name: `atoma_total_bad_requests`
+/// - Type: Counter
+/// - Labels: `model`
+/// - Unit: requests (count)
+pub static TOTAL_BAD_REQUESTS: LazyLock<Counter<u64>> = LazyLock::new(|| {
+    GLOBAL_METER
+        .u64_counter("atoma_total_bad_requests")
+        .with_description("Total number of bad request requests")
+        .with_unit("requests")
+        .build()
+});
+
 /// Counter metric that tracks the total number of confidential chat requests.
 ///
 /// # Metric Details

From 15e31ba865f9271000aea1484f3b71ec7851ba55 Mon Sep 17 00:00:00 2001
From: Chad Nehemiah <chad.nehemiah94@gmail.com>
Date: Thu, 29 May 2025 04:12:35 -0500
Subject: [PATCH 02/13] ci: use stable toolchain (#645)

* ci: use stable toolchain

* chore: fix clippy issues
---
 .github/workflows/ci.yml                |  2 +-
 .github/workflows/coverage.yml          |  2 +-
 Dockerfile                              |  4 ++--
 atoma-daemon/src/components/openapi.rs  |  2 +-
 atoma-p2p-tester/Dockerfile             |  4 ++--
 atoma-p2p/src/service.rs                |  9 ++++++---
 atoma-p2p/src/tests.rs                  | 17 ++++++++++-------
 atoma-p2p/src/types.rs                  | 22 +++++++++++-----------
 atoma-p2p/src/utils.rs                  | 16 +++++++++-------
 atoma-service/src/components/openapi.rs |  2 +-
 atoma-service/src/streamer.rs           |  1 +
 11 files changed, 45 insertions(+), 36 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 42d72a81..9f044761 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -10,7 +10,7 @@ name: CI
   merge_group:
 
 env:
-  toolchain: nightly-2024-11-14
+  toolchain: stable
   CARGO_HTTP_MULTIPLEXING: false
   CARGO_TERM_COLOR: always
   CARGO_UNSTABLE_SPARSE_REGISTRY: true
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index ee37ddd9..089ba8f3 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -3,7 +3,7 @@ name: Coverage
 on: [push, pull_request]
 
 env:
-  toolchain: nightly-2024-11-14
+  toolchain: stable
   CARGO_HTTP_MULTIPLEXING: false
   CARGO_TERM_COLOR: always
   CARGO_UNSTABLE_SPARSE_REGISTRY: true
diff --git a/Dockerfile b/Dockerfile
index cecda550..6aea2fb9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -22,8 +22,8 @@ RUN echo "* soft nofile 65535" >> /etc/security/limits.conf && \
     echo "* soft nproc 65535" >> /etc/security/limits.conf && \
     echo "* hard nproc 65535" >> /etc/security/limits.conf
 
-# Install Rust 1.84.0
-RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.84.0 \
+# Install Rust 1.87.0
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.87.0 \
     && . "$HOME/.cargo/env"
 
 # Add cargo to PATH
diff --git a/atoma-daemon/src/components/openapi.rs b/atoma-daemon/src/components/openapi.rs
index 2f90dd6d..ab879e02 100644
--- a/atoma-daemon/src/components/openapi.rs
+++ b/atoma-daemon/src/components/openapi.rs
@@ -46,7 +46,7 @@ pub fn openapi_routes() -> Router {
         let spec_path = docs_dir.join("openapi.yml");
         fs::write(&spec_path, spec).expect("Failed to write OpenAPI spec to file");
 
-        println!("OpenAPI spec written to: {spec_path:?}");
+        println!("OpenAPI spec written to: {}", spec_path.display());
     }
 
     Router::new()
diff --git a/atoma-p2p-tester/Dockerfile b/atoma-p2p-tester/Dockerfile
index 24495bc7..63684040 100644
--- a/atoma-p2p-tester/Dockerfile
+++ b/atoma-p2p-tester/Dockerfile
@@ -18,8 +18,8 @@ RUN apt-get update && apt-get install -y \
 	ca-certificates \
 	&& rm -rf /var/lib/apt/lists/*
 
-# Install Rust 1.84.0
-RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.84.0 \
+# Install Rust 1.87.0
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.87.0 \
 	&& . "$HOME/.cargo/env"
 
 # Add cargo to PATH
diff --git a/atoma-p2p/src/service.rs b/atoma-p2p/src/service.rs
index 64a43e25..5117d8be 100644
--- a/atoma-p2p/src/service.rs
+++ b/atoma-p2p/src/service.rs
@@ -1029,7 +1029,8 @@ impl AtomaP2pNode {
             return Ok(());
         }
         // Directly deserialize SignedNodeMessage using new method
-        let signed_node_message = SignedNodeMessage::deserialize_with_signature(&message_data)?;
+        let signed_node_message =
+            SignedNodeMessage::deserialize_with_signature(&message_data).map_err(|e| *e)?;
         let signature_len = signed_node_message.signature.len();
         trace!(
             target = "atoma-p2p",
@@ -1058,7 +1059,7 @@ impl AtomaP2pNode {
                 );
                 // NOTE: We should reject the message if it fails to validate
                 // as it means the node is not being following the current protocol
-                if let AtomaP2pNodeError::UrlParseError(_) = e {
+                if let AtomaP2pNodeError::UrlParseError(_) = *e {
                     // We remove the peer from the gossipsub topic, because it is not a valid URL and therefore cannot be reached
                     // by clients for processing OpenAI api compatible AI requests, so these peers are not useful for the network
                     self.swarm
@@ -1192,7 +1193,9 @@ impl AtomaP2pNode {
             node_message,
             signature: Bytes::copy_from_slice(signature.as_ref()),
         };
-        let serialized_signed_node_message = signed_node_message.serialize_with_signature()?;
+        let serialized_signed_node_message = signed_node_message
+            .serialize_with_signature()
+            .map_err(|e| *e)?;
         let topic = gossipsub::IdentTopic::new(METRICS_GOSPUBSUB_TOPIC);
         self.swarm
             .behaviour_mut()
diff --git a/atoma-p2p/src/tests.rs b/atoma-p2p/src/tests.rs
index 7f32cebc..0d3e24f2 100644
--- a/atoma-p2p/src/tests.rs
+++ b/atoma-p2p/src/tests.rs
@@ -161,7 +161,10 @@ async fn test_validate_usage_metrics_message_invalid_url() {
         &tx,
     )
     .await;
-    assert!(matches!(result, Err(AtomaP2pNodeError::UrlParseError(_))));
+    assert!(matches!(
+        result,
+        Err(e) if matches!(*e, AtomaP2pNodeError::UrlParseError(_))
+    ));
 }
 
 #[tokio::test]
@@ -185,7 +188,7 @@ async fn test_validate_usage_metrics_message_expired_timestamp() {
     .await;
     assert!(matches!(
         result,
-        Err(AtomaP2pNodeError::InvalidPublicAddressError(_))
+        Err(e) if matches!(*e, AtomaP2pNodeError::InvalidPublicAddressError(_))
     ));
 }
 
@@ -210,7 +213,7 @@ async fn test_validate_usage_metrics_message_future_timestamp() {
     .await;
     assert!(matches!(
         result,
-        Err(AtomaP2pNodeError::InvalidPublicAddressError(_))
+        Err(e) if matches!(*e, AtomaP2pNodeError::InvalidPublicAddressError(_))
     ));
 }
 
@@ -244,7 +247,7 @@ async fn test_validate_usage_metrics_message_invalid_signature() {
     .await;
     assert!(matches!(
         result,
-        Err(AtomaP2pNodeError::SignatureVerificationError(_))
+        Err(e) if matches!(*e, AtomaP2pNodeError::SignatureVerificationError(_))
     ));
 }
 
@@ -283,7 +286,7 @@ async fn test_validate_usage_metrics_message_invalid_node_ownership() {
 
     assert!(matches!(
         result,
-        Err(AtomaP2pNodeError::NodeSmallIdOwnershipVerificationError(_))
+        Err(e) if matches!(*e, AtomaP2pNodeError::NodeSmallIdOwnershipVerificationError(_))
     ));
 }
 
@@ -312,7 +315,7 @@ async fn test_validate_usage_metrics_message_state_manager_error() {
     .await;
     assert!(matches!(
         result,
-        Err(AtomaP2pNodeError::StateManagerError(_))
+        Err(e) if matches!(*e, AtomaP2pNodeError::StateManagerError(_))
     ));
 }
 
@@ -352,6 +355,6 @@ async fn test_validate_usage_metrics_message_response_channel_error() {
     .await;
     assert!(matches!(
         result,
-        Err(AtomaP2pNodeError::NodeSmallIdOwnershipVerificationError(_))
+        Err(e) if matches!(*e, AtomaP2pNodeError::NodeSmallIdOwnershipVerificationError(_))
     ));
 }
diff --git a/atoma-p2p/src/types.rs b/atoma-p2p/src/types.rs
index 99d3d81b..31a95310 100644
--- a/atoma-p2p/src/types.rs
+++ b/atoma-p2p/src/types.rs
@@ -17,7 +17,7 @@ pub const SECP256K1_SIGNATURE_LENGTH: usize = 98;
 /// see <https://github.com/MystenLabs/sui/blob/main/crates/sui-types/src/crypto.rs#L891>
 pub const SECP256R1_SIGNATURE_LENGTH: usize = 98;
 
-type Result<T, E = AtomaP2pNodeError> = std::result::Result<T, E>;
+type Result<T, E = Box<AtomaP2pNodeError>> = std::result::Result<T, E>;
 
 /// An enum representing different types of events that can be emitted by the Atoma P2P node.
 pub enum AtomaP2pEvent {
@@ -138,8 +138,7 @@ pub struct NodeMessage {
 impl SerializeWithHash for NodeMessage {
     fn serialize_with_hash(&self) -> Result<SerializedMessage> {
         let mut buffer = BytesMut::new();
-        ciborium::into_writer(self, (&mut buffer).writer())
-            .map_err(AtomaP2pNodeError::UsageMetricsSerializeError)?;
+        ciborium::into_writer(self, (&mut buffer).writer()).map_err(|e| Box::new(e.into()))?;
         Ok(SerializedMessage {
             hash: blake3::hash(buffer.as_ref()),
             message: buffer.freeze(),
@@ -185,35 +184,36 @@ pub trait SerializeWithSignature {
 }
 
 impl SerializeWithSignature for SignedNodeMessage {
-    fn serialize_with_signature(&self) -> Result<Bytes, AtomaP2pNodeError> {
+    fn serialize_with_signature(&self) -> Result<Bytes> {
         let mut buffer = BytesMut::with_capacity(1024);
         buffer.extend_from_slice(&self.signature);
 
         // Serialize node message
         ciborium::into_writer(&self.node_message, (&mut buffer).writer())
-            .map_err(AtomaP2pNodeError::UsageMetricsSerializeError)?;
+            .map_err(|e| Box::new(e.into()))?;
 
         Ok(buffer.freeze())
     }
 
-    fn deserialize_with_signature(data: &[u8]) -> Result<Self, AtomaP2pNodeError> {
+    fn deserialize_with_signature(data: &[u8]) -> Result<Self> {
         let signature_len = data
             .first()
             .map(|&flag| match flag {
                 f if f == Ed25519SuiSignature::SCHEME.flag() => Ok(ED25519_SIGNATURE_LENGTH),
                 f if f == Secp256k1SuiSignature::SCHEME.flag() => Ok(SECP256K1_SIGNATURE_LENGTH),
                 f if f == Secp256r1SuiSignature::SCHEME.flag() => Ok(SECP256R1_SIGNATURE_LENGTH),
-                f => Err(AtomaP2pNodeError::SignatureParseError(format!(
+                f => Err(Box::new(AtomaP2pNodeError::SignatureParseError(format!(
                     "Invalid signature scheme, expected 0x00, 0x01 or 0x02, received {f:#04x}",
-                ))),
+                )))),
             })
             .ok_or_else(|| {
-                AtomaP2pNodeError::SignatureParseError(
+                Box::new(AtomaP2pNodeError::SignatureParseError(
                     "Invalid signature scheme: the data is empty".to_string(),
-                )
+                ))
             })??;
         let signature = Bytes::copy_from_slice(&data[0..signature_len]);
-        let node_message = ciborium::from_reader(&data[signature_len..])?;
+        let node_message =
+            ciborium::from_reader(&data[signature_len..]).map_err(|e| Box::new(e.into()))?;
         Ok(Self {
             node_message,
             signature,
diff --git a/atoma-p2p/src/utils.rs b/atoma-p2p/src/utils.rs
index 21774be5..1c726462 100644
--- a/atoma-p2p/src/utils.rs
+++ b/atoma-p2p/src/utils.rs
@@ -75,7 +75,7 @@ const EXPIRED_TIMESTAMP_THRESHOLD: u64 = 10 * 60; // 10 minutes
 #[instrument(level = "debug", skip_all)]
 pub fn validate_node_message_country_url_timestamp(
     node_message: &NodeMessage,
-) -> Result<(), AtomaP2pNodeError> {
+) -> Result<(), Box<AtomaP2pNodeError>> {
     let now = std::time::Instant::now().elapsed().as_secs();
 
     let country = node_message.node_metadata.country.as_str();
@@ -91,7 +91,7 @@ pub fn validate_node_message_country_url_timestamp(
                 "Invalid URL format, received address: {}",
                 node_message.node_metadata.node_public_url
             );
-            AtomaP2pNodeError::UrlParseError(e)
+            Box::new(AtomaP2pNodeError::UrlParseError(e))
         })?;
 
     // Check if the timestamp is within a reasonable time frame
@@ -105,18 +105,20 @@ pub fn validate_node_message_country_url_timestamp(
             node_message.node_metadata.timestamp,
             now
         );
-        return Err(AtomaP2pNodeError::InvalidPublicAddressError(
+        return Err(Box::new(AtomaP2pNodeError::InvalidPublicAddressError(
             "Timestamp is too far in the past".to_string(),
-        ));
+        )));
     }
 
     Ok(())
 }
 
 /// Custom validation function for ISO 3166-1 alpha-2 country codes
-fn validate_country_code(code: &str) -> Result<(), AtomaP2pNodeError> {
+fn validate_country_code(code: &str) -> Result<(), Box<AtomaP2pNodeError>> {
     isocountry::CountryCode::for_alpha2(code).map_err(|_| {
-        AtomaP2pNodeError::InvalidCountryCodeError("Country code is invalid.".to_string())
+        Box::new(AtomaP2pNodeError::InvalidCountryCodeError(
+            "Country code is invalid.".to_string(),
+        ))
     })?;
     Ok(())
 }
@@ -331,7 +333,7 @@ pub async fn validate_signed_node_message(
     node_message_hash: &[u8; 32],
     signature: &[u8],
     state_manager_sender: &Sender<StateManagerEvent>,
-) -> Result<(), AtomaP2pNodeError> {
+) -> Result<(), Box<AtomaP2pNodeError>> {
     // Validate the message's node public URL and timestamp
     validate_node_message_country_url_timestamp(node_message)?;
     // Verify the signature of the message
diff --git a/atoma-service/src/components/openapi.rs b/atoma-service/src/components/openapi.rs
index c963eb0a..8887dfd6 100644
--- a/atoma-service/src/components/openapi.rs
+++ b/atoma-service/src/components/openapi.rs
@@ -62,7 +62,7 @@ pub fn openapi_routes() -> Router {
         let spec_path = docs_dir.join("openapi.yml");
         fs::write(&spec_path, spec).expect("Failed to write OpenAPI spec to file");
 
-        println!("OpenAPI spec written to: {:?}", spec_path);
+        println!("OpenAPI spec written to: {}", spec_path.display());
     }
 
     Router::new()
diff --git a/atoma-service/src/streamer.rs b/atoma-service/src/streamer.rs
index abe26f78..9ded19e1 100644
--- a/atoma-service/src/streamer.rs
+++ b/atoma-service/src/streamer.rs
@@ -134,6 +134,7 @@ pub struct Streamer {
     /// kills the connection before the final chunk is sent. If, instead,
     /// the last chunk is handled, the value is updated to the actual number of tokens
     /// returned by the LLM inference service
+    #[allow(clippy::struct_field_names)]
     streamer_computed_num_tokens: i64,
     /// The number of input tokens for the request
     num_input_tokens: i64,

From 175854148a80c40f615e9eeaa45a01e152adeb24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Ant=C3=B3nio?= <matroid@outlook.com>
Date: Fri, 30 May 2025 14:40:12 +0100
Subject: [PATCH 03/13] revert to use prometheus for queued requests (#646)

* revert to use prometheus for queued requests

* add start metrics collector

* update logs
---
 atoma-bin/atoma_node.rs                       |  18 +-
 atoma-service/src/config.rs                   |   8 +-
 .../src/handlers/chat_completions.rs          |  33 +-
 atoma-service/src/handlers/completions.rs     |  34 +-
 atoma-service/src/handlers/mod.rs             | 478 +++++++++++++++++-
 atoma-service/src/handlers/request_counter.rs |  73 ---
 atoma-service/src/server.rs                   |   6 +-
 atoma-service/src/streamer.rs                 |  12 -
 atoma-service/src/tests.rs                    |   3 +-
 config.example.toml                           |   8 -
 10 files changed, 491 insertions(+), 182 deletions(-)
 delete mode 100644 atoma-service/src/handlers/request_counter.rs

diff --git a/atoma-bin/atoma_node.rs b/atoma-bin/atoma_node.rs
index b8103256..6359881d 100644
--- a/atoma-bin/atoma_node.rs
+++ b/atoma-bin/atoma_node.rs
@@ -4,9 +4,7 @@ use anyhow::{Context, Result};
 use atoma_confidential::AtomaConfidentialCompute;
 use atoma_daemon::{telemetry, AtomaDaemonConfig, DaemonState};
 use atoma_p2p::{AtomaP2pNode, AtomaP2pNodeConfig};
-use atoma_service::{
-    config::AtomaServiceConfig, handlers::request_counter::RequestCounter, server::AppState,
-};
+use atoma_service::{config::AtomaServiceConfig, server::AppState};
 use atoma_state::{config::AtomaStateManagerConfig, AtomaState, AtomaStateManager};
 use atoma_sui::{client::Client, config::Config, subscriber::Subscriber};
 use atoma_utils::spawn_with_shutdown;
@@ -373,9 +371,21 @@ async fn main() -> Result<()> {
         keystore: Arc::new(keystore),
         address_index,
         whitelist_sui_addresses_for_fiat: config.service.whitelist_sui_addresses_for_fiat,
-        running_num_requests: Arc::new(RequestCounter::new()),
     };
 
+    let chat_completions_service_urls = app_state
+        .chat_completions_service_urls
+        .iter()
+        .flat_map(|(model, urls)| {
+            urls.iter()
+                .map(|(url, job)| (model.clone(), url.clone(), job.clone()))
+        })
+        .collect();
+    atoma_service::handlers::inference_service_metrics::start_metrics_updater(
+        chat_completions_service_urls,
+        config.service.metrics_update_interval,
+    );
+
     let daemon_app_state = DaemonState {
         atoma_state: AtomaState::new_from_url(&config.state.database_url).await?,
         client,
diff --git a/atoma-service/src/config.rs b/atoma-service/src/config.rs
index 9538fb49..f7544506 100644
--- a/atoma-service/src/config.rs
+++ b/atoma-service/src/config.rs
@@ -9,12 +9,10 @@ use serde::Deserialize;
 /// including URLs for various services and a list of models.
 #[derive(Debug, Deserialize)]
 pub struct AtomaServiceConfig {
-    /// URL for the chat completions service with maximum concurrency settings.
+    /// URL for the chat completions service.
     ///
-    /// This is an optional field that, if provided, specifies the endpoint
-    /// for the chat completions service used by the Atoma Service, together with its
-    /// associated Prometheus job name.
-    pub chat_completions_service_urls: HashMap<String, Vec<(String, String, usize)>>,
+    /// This field specifies the endpoint for the chat completions service used by the Atoma Service.
+    pub chat_completions_service_urls: HashMap<String, Vec<(String, String)>>,
 
     /// URL for the embeddings service.
     ///
diff --git a/atoma-service/src/handlers/chat_completions.rs b/atoma-service/src/handlers/chat_completions.rs
index 7a2815d0..89ec218f 100644
--- a/atoma-service/src/handlers/chat_completions.rs
+++ b/atoma-service/src/handlers/chat_completions.rs
@@ -49,10 +49,7 @@ use tracing::{debug, info, instrument};
 use utoipa::OpenApi;
 
 use serde::Deserialize;
-use std::{
-    sync::Arc,
-    time::{Duration, Instant},
-};
+use std::time::{Duration, Instant};
 
 use crate::{
     error::AtomaServiceError,
@@ -904,16 +901,12 @@ async fn handle_streaming_response(
             }
         })?;
     let (chat_completions_service_url, status_code) =
-        get_best_available_chat_completions_service_url(
-            &state.running_num_requests,
-            chat_completions_service_urls,
-            model,
-        )
-        .await
-        .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
-            message: e.to_string(),
-            endpoint: endpoint.clone(),
-        })?;
+        get_best_available_chat_completions_service_url(chat_completions_service_urls, model)
+            .await
+            .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
+                message: e.to_string(),
+                endpoint: endpoint.clone(),
+            })?;
     if status_code == StatusCode::TOO_MANY_REQUESTS {
         return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
             message: "Too many requests".to_string(),
@@ -931,9 +924,6 @@ async fn handle_streaming_response(
         .send()
         .await
         .map_err(|e| {
-            state
-                .running_num_requests
-                .decrement(&chat_completions_service_url);
             AtomaServiceError::InternalError {
                 message: format!(
                     "Error sending request to inference service, for request with payload hash: {:?}, and stack small id: {:?}, with error: {}",
@@ -946,9 +936,6 @@ async fn handle_streaming_response(
         })?;
 
     if !response.status().is_success() {
-        state
-            .running_num_requests
-            .decrement(&chat_completions_service_url);
         let status = response.status();
         let bytes = response
             .bytes()
@@ -1002,8 +989,6 @@ async fn handle_streaming_response(
         price_per_one_million_compute_units,
         user_id,
         user_address,
-        Arc::clone(&state.running_num_requests),
-        chat_completions_service_url,
     ))
     .keep_alive(
         axum::response::sse::KeepAlive::new()
@@ -1335,7 +1320,6 @@ pub mod utils {
             })?;
         let (chat_completions_service_url, status_code) =
             get_best_available_chat_completions_service_url(
-                &state.running_num_requests,
                 chat_completions_service_url_services,
                 model,
             )
@@ -1358,9 +1342,6 @@ pub mod utils {
             .json(&payload)
             .send()
             .await;
-        state
-            .running_num_requests
-            .decrement(&chat_completions_service_url);
         let response = response.map_err(|e| {
             AtomaServiceError::InternalError {
                 message: format!(
diff --git a/atoma-service/src/handlers/completions.rs b/atoma-service/src/handlers/completions.rs
index 268d4e4e..e3bbc3b5 100644
--- a/atoma-service/src/handlers/completions.rs
+++ b/atoma-service/src/handlers/completions.rs
@@ -37,10 +37,7 @@ use tracing::{debug, info, instrument};
 use utoipa::OpenApi;
 
 use serde::Deserialize;
-use std::{
-    sync::Arc,
-    time::{Duration, Instant},
-};
+use std::time::{Duration, Instant};
 
 use crate::{
     error::AtomaServiceError,
@@ -877,16 +874,13 @@ async fn handle_streaming_response(
                 endpoint: endpoint.clone(),
             }
         })?;
-    let (completions_service_url, status_code) = get_best_available_chat_completions_service_url(
-        &state.running_num_requests,
-        chat_completions_service_urls,
-        model,
-    )
-    .await
-    .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
-        message: e.to_string(),
-        endpoint: endpoint.clone(),
-    })?;
+    let (completions_service_url, status_code) =
+        get_best_available_chat_completions_service_url(chat_completions_service_urls, model)
+            .await
+            .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
+                message: e.to_string(),
+                endpoint: endpoint.clone(),
+            })?;
     if status_code == StatusCode::TOO_MANY_REQUESTS {
         return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
             message: "Too many requests".to_string(),
@@ -901,9 +895,6 @@ async fn handle_streaming_response(
         .send()
         .await
         .map_err(|e| {
-            state
-                .running_num_requests
-                .decrement(&completions_service_url);
             AtomaServiceError::InternalError {
                 message: format!(
                     "Error sending request to inference service, for request with payload hash: {:?}, and stack small id: {:?}, with error: {}",
@@ -916,9 +907,6 @@ async fn handle_streaming_response(
     })?;
 
     if !response.status().is_success() {
-        state
-            .running_num_requests
-            .decrement(&completions_service_url);
         let status = response.status();
         let bytes = response
             .bytes()
@@ -971,8 +959,6 @@ async fn handle_streaming_response(
         price_per_one_million_tokens,
         user_id,
         user_address,
-        Arc::clone(&state.running_num_requests),
-        completions_service_url,
     ))
     .keep_alive(
         axum::response::sse::KeepAlive::new()
@@ -1297,7 +1283,6 @@ pub mod utils {
             })?;
         let (completions_service_url, status_code) =
             get_best_available_chat_completions_service_url(
-                &state.running_num_requests,
                 completions_service_url_services,
                 model,
             )
@@ -1317,9 +1302,6 @@ pub mod utils {
             .json(&payload)
             .send()
             .await;
-        state
-            .running_num_requests
-            .decrement(&completions_service_url);
         let response = response
         .map_err(|e| {
             AtomaServiceError::InternalError {
diff --git a/atoma-service/src/handlers/mod.rs b/atoma-service/src/handlers/mod.rs
index 7b86d750..aa79a7e4 100644
--- a/atoma-service/src/handlers/mod.rs
+++ b/atoma-service/src/handlers/mod.rs
@@ -4,7 +4,6 @@ pub mod completions;
 pub mod embeddings;
 pub mod image_generations;
 pub mod metrics;
-pub mod request_counter;
 pub mod request_model;
 pub mod stop_streamer;
 
@@ -572,14 +571,268 @@ pub fn handle_status_code_error(
 }
 
 pub mod inference_service_metrics {
+    use futures::future::join_all;
+    use opentelemetry::KeyValue;
+    use prometheus_parse::Scrape;
+    use prometheus_parse::Value;
+    use rand::Rng;
+    use std::sync::Arc;
+    use std::sync::LazyLock;
+    use std::time::Duration;
+    use tokio::sync::RwLock;
+    use tokio::time;
 
+    use crate::handlers::metrics::CHAT_COMPLETIONS_TOO_MANY_REQUESTS;
     use hyper::StatusCode;
-    use rand::seq::SliceRandom;
-    use tracing::instrument;
+    use tracing::{info, instrument};
 
-    use super::request_counter::RequestCounter;
+    use super::InferenceService;
 
     pub type Result<T> = std::result::Result<T, ChatCompletionsMetricsError>;
+    type MetricValue = ChatCompletionsMetrics;
+    type MetricResult = Result<MetricValue>;
+    type MetricsVec = Vec<MetricResult>;
+    type CachedMetrics = Option<MetricsVec>;
+    type MetricsLock = Arc<RwLock<CachedMetrics>>;
+
+    /// The default interval, in milliseconds, for updating the metrics
+    const DEFAULT_METRICS_UPDATE_INTERVAL_MILLIS: u64 = 35;
+
+    /// The timeout for the Prometheus metrics queries
+    const METRICS_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(2);
+
+    /// The HTTP client for the metrics queries
+    static HTTP_CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
+        reqwest::Client::builder()
+            .timeout(METRICS_TIMEOUT)
+            .build()
+            .expect("Failed to create HTTP client")
+    });
+
+    /// Chat completions metrics
+    #[derive(Debug, Clone)]
+    struct ChatCompletionsMetrics {
+        /// The model name
+        model: String,
+        /// The chat completions service url
+        chat_completions_service_url: String,
+        /// The number of queued requests
+        num_queued_requests: f64,
+        /// The number of running requests
+        num_running_requests: f64,
+    }
+
+    /// Cache structure to store metrics
+    #[derive(Debug, Default)]
+    struct MetricsCache {
+        metrics: MetricsLock,
+    }
+
+    impl MetricsCache {
+        fn new() -> Self {
+            Self {
+                metrics: Arc::new(RwLock::new(None)),
+            }
+        }
+
+        async fn get_metrics(&self) -> Option<MetricsVec> {
+            self.metrics.read().await.clone()
+        }
+
+        async fn update_metrics(&self, new_metrics: Vec<Result<ChatCompletionsMetrics>>) {
+            *self.metrics.write().await = Some(new_metrics);
+        }
+    }
+
+    /// Global metrics cache for vLLM inference services
+    #[allow(clippy::redundant_closure)]
+    static VLLM_METRICS_CACHE: LazyLock<MetricsCache> = LazyLock::new(|| MetricsCache::new());
+
+    /// Global metrics cache for SgLang inference services
+    #[allow(clippy::redundant_closure)]
+    static SGLANG_METRICS_CACHE: LazyLock<MetricsCache> = LazyLock::new(|| MetricsCache::new());
+
+    /// Start the background task that refreshes the cached metrics at a fixed interval (default: 35 milliseconds)
+    ///
+    /// # Arguments
+    ///
+    /// * `chat_completions_service_urls` - A vector of tuples containing the model name, the chat completions service URL and the job name.
+    /// * `metrics_update_interval` - The interval in milliseconds between metrics updates; falls back to the default when `None`.
+    #[instrument(level = "info", skip_all)]
+    pub fn start_metrics_updater(
+        chat_completions_service_urls: Vec<(String, String, String)>,
+        metrics_update_interval: Option<u64>,
+    ) {
+        type ChatCompletionsServiceUrls = Vec<(String, String, String)>;
+        info!(
+            target = "atoma-service",
+            module = "inference_service_metrics",
+            level = "info",
+            "Starting metrics updater with {chat_completions_service_urls:?}"
+        );
+        let (vllm_chat_completions_service_urls, sglang_chat_completions_service_urls): (
+            ChatCompletionsServiceUrls,
+            ChatCompletionsServiceUrls,
+        ) = chat_completions_service_urls
+            .iter()
+            .cloned()
+            .partition(|(_, _, job)| job.contains("vllm"));
+        info!(
+            target = "atoma-service",
+            module = "inference_service_metrics",
+            level = "info",
+            "Partitioned chat completions service urls: vllm: {vllm_chat_completions_service_urls:?}, sglang: {sglang_chat_completions_service_urls:?}"
+        );
+        let vllm_chat_completions_service_urls = Arc::new(vllm_chat_completions_service_urls);
+        let sglang_chat_completions_service_urls = Arc::new(sglang_chat_completions_service_urls);
+        tokio::spawn(async move {
+            let metrics_interval =
+                metrics_update_interval.unwrap_or(DEFAULT_METRICS_UPDATE_INTERVAL_MILLIS);
+            info!(
+                target = "atoma-service",
+                module = "inference_service_metrics",
+                level = "info",
+                "Metrics update interval: {metrics_interval} milliseconds"
+            );
+            let mut interval = time::interval(Duration::from_millis(metrics_interval));
+            loop {
+                interval.tick().await;
+                if !vllm_chat_completions_service_urls.is_empty() {
+                    let vllm_metrics =
+                        get_metrics(&InferenceService::Vllm, &vllm_chat_completions_service_urls)
+                            .await;
+                    if vllm_metrics.iter().any(std::result::Result::is_ok) {
+                        VLLM_METRICS_CACHE.update_metrics(vllm_metrics).await;
+                    } else {
+                        tracing::warn!(
+                            "Failed to retrieve any valid vLLM metrics, not updating cache"
+                        );
+                    }
+                }
+                if !sglang_chat_completions_service_urls.is_empty() {
+                    let sglang_metrics = get_metrics(
+                        &InferenceService::SgLang,
+                        &sglang_chat_completions_service_urls,
+                    )
+                    .await;
+                    if sglang_metrics.iter().any(std::result::Result::is_ok) {
+                        SGLANG_METRICS_CACHE.update_metrics(sglang_metrics).await;
+                    } else {
+                        tracing::warn!(
+                            "Failed to retrieve any valid SgLang metrics, not updating cache"
+                        );
+                    }
+                }
+            }
+        });
+    }
+
+    /// Fetches metrics from the specified chat completions service URL.
+    ///
+    /// This function retrieves metrics from the specified chat completions service URL
+    /// and parses the response to extract relevant metrics such as the number of queued
+    /// requests and running requests. It handles errors gracefully and returns a vector
+    /// of results, where each result contains the metrics for a specific service URL.
+    ///
+    /// # Arguments
+    ///
+    /// * `inference_service` - The inference service type (vLLM or SgLang).
+    /// * `jobs_with_url` - A slice of tuples containing model name, the chat completions service URL
+    ///   and the job name (e.g., "vllm-service", "sglang-service").
+    ///
+    /// # Returns
+    ///
+    /// Returns a `Vec<Result<ChatCompletionsMetrics>>`, where each result contains
+    /// the metrics for a specific service URL. If an error occurs while fetching or parsing
+    /// the metrics, the error is returned in the result.
+    ///
+    /// # Errors
+    ///
+    /// *   `ChatCompletionsMetricsError::NoMetricsFound`: If no metrics are found for the
+    ///     specified job or if the metrics response is invalid.
+    /// *   Other variants of `ChatCompletionsMetricsError` may be returned if underlying
+    ///     issues occur during metric collection from Prometheus (e.g., network errors,
+    ///     parsing errors), though the function attempts to handle missing individual metrics
+    ///     gracefully.
+    async fn get_metrics(
+        inference_service: &InferenceService,
+        jobs_with_url: &[(String, String, String)], // (model, url, job)
+    ) -> Vec<Result<ChatCompletionsMetrics>> {
+        let tasks =
+            jobs_with_url
+                .iter()
+                .map(|(model, chat_completions_service_url, job)| async move {
+                    let response = HTTP_CLIENT
+                        .get(format!("{chat_completions_service_url}/metrics"))
+                        .send()
+                        .await
+                        .map_err(|_| {
+                            ChatCompletionsMetricsError::NoMetricsFound(job.to_string())
+                        })?;
+                    let body = response.text().await?;
+                    let lines = body
+                        .lines()
+                        .map(|line| Ok(line.replace(inference_service.get_service_prefix(), "")));
+                    let metrics = Scrape::parse(lines).unwrap();
+                    let num_queued_requests = extract_metric(
+                        &metrics,
+                        inference_service.get_queued_requests_metric_name(),
+                        job,
+                    )?;
+                    let num_running_requests = extract_metric(
+                        &metrics,
+                        inference_service.get_running_requests_metric_name(),
+                        job,
+                    )?;
+
+                    Ok(ChatCompletionsMetrics {
+                        model: model.clone(),
+                        chat_completions_service_url: chat_completions_service_url.clone(),
+                        num_queued_requests,
+                        num_running_requests,
+                    })
+                });
+        join_all(tasks).await
+    }
+
+    /// Extracts a specific metric from the Scrape response.
+    ///
+    /// This function searches for a metric with the specified name in the
+    /// Scrape response and returns its value if found.
+    ///
+    /// # Arguments
+    ///
+    /// * `metrics` - The Scrape response containing the metrics.
+    /// * `name` - The name of the metric to extract.
+    /// * `job` - The job name used for error reporting.
+    ///
+    /// # Returns
+    ///
+    /// Returns a `Result<f64>` containing the metric value if found,
+    /// or an error if not found or if the value is not a Gauge.
+    ///
+    /// # Errors
+    ///
+    /// *   `ChatCompletionsMetricsError::NoMetricsFound`: If the specified metric is not found
+    ///     or if the value is not a Gauge.
+    /// *   No other variant of `ChatCompletionsMetricsError` is produced by this function:
+    ///     it only inspects an already-parsed `Scrape` and performs no network or
+    ///     parsing work itself, so fetch failures are reported by the caller
+    ///     (`get_metrics`) rather than here.
+    fn extract_metric(metrics: &Scrape, name: &str, job: &str) -> Result<f64> {
+        metrics
+            .samples
+            .iter()
+            .find(|s| s.metric == name)
+            .ok_or_else(|| ChatCompletionsMetricsError::NoMetricsFound(job.to_string()))
+            .and_then(|sample| {
+                if let Value::Gauge(value) = sample.value {
+                    Ok(value)
+                } else {
+                    Err(ChatCompletionsMetricsError::NoMetricsFound(job.to_string()))
+                }
+            })
+    }
 
     /// Selects the best available chat completions service URL for a given model based on performance metrics.
     ///
@@ -642,41 +895,224 @@ pub mod inference_service_metrics {
     #[instrument(level = "info", skip_all, fields(model=model))]
     #[allow(clippy::float_cmp)]
     pub async fn get_best_available_chat_completions_service_url(
-        running_num_requests: &RequestCounter,
-        chat_completions_service_urls: &[(String, String, usize)], // (url, job, max_concurrent_requests)
+        chat_completions_service_urls: &[(String, String)],
         model: &str,
     ) -> Result<(String, StatusCode)> {
-        // Ensure there are service URLs to choose from.
+        const MAX_ALLOWED_NUM_QUEUED_REQUESTS: f64 = 1.0; // Default to 1 request
+
+        type ChatCompletionsServiceUrls = Vec<(String, String)>;
+
         if chat_completions_service_urls.is_empty() {
-            tracing::warn!(
-                target = "atoma-service",
-                model = model,
-                "No chat completions service URLs provided for model."
-            );
             return Err(
                 ChatCompletionsMetricsError::NoChatCompletionsServiceUrlsFound(model.to_string()),
             );
         }
-        let mut shuffled_chat_completions_service_urls = chat_completions_service_urls.to_vec();
-        shuffled_chat_completions_service_urls.shuffle(&mut rand::thread_rng());
-        for (url_str, _job_name, max_concurrent_val) in &shuffled_chat_completions_service_urls {
-            if running_num_requests.increment(url_str, *max_concurrent_val) {
-                return Ok((url_str.clone(), StatusCode::OK));
+        tracing::debug!(
+            target = "atoma-service",
+            module = "inference_service_metrics",
+            level = "info",
+            "Getting best available chat completions service URL for model: {model} and urls: {chat_completions_service_urls:?}"
+        );
+        let (vllm_chat_completions_service_urls, sglang_chat_completions_service_urls): (
+            ChatCompletionsServiceUrls,
+            ChatCompletionsServiceUrls,
+        ) = chat_completions_service_urls
+            .iter()
+            .cloned()
+            .partition(|(_, job)| job.contains("vllm"));
+
+        tracing::debug!(
+            target = "atoma-service",
+            module = "inference_service_metrics",
+            level = "info",
+            "Partitioned chat completions service urls: vllm: {vllm_chat_completions_service_urls:?}, sglang: {sglang_chat_completions_service_urls:?}"
+        );
+
+        // Get cached metrics
+        let vllm_metrics = if vllm_chat_completions_service_urls.is_empty() {
+            vec![]
+        } else if let Some(metrics) = VLLM_METRICS_CACHE.get_metrics().await {
+            metrics
+        } else {
+            info!(
+                target = "atoma-service",
+                module = "inference_service_metrics",
+                level = "info",
+                "No cached vLLM metrics, getting them directly"
+            );
+            let vllm_chat_completions_service_urls_with_model: Vec<(String, String, String)> =
+                vllm_chat_completions_service_urls
+                    .iter()
+                    .map(|(url, job)| (model.to_string(), url.clone(), job.clone()))
+                    .collect();
+            get_metrics(
+                &InferenceService::Vllm,
+                &vllm_chat_completions_service_urls_with_model,
+            )
+            .await
+        };
+        let sglang_metrics = if sglang_chat_completions_service_urls.is_empty() {
+            vec![]
+        } else if let Some(metrics) = SGLANG_METRICS_CACHE.get_metrics().await {
+            metrics
+        } else {
+            info!(
+                target = "atoma-service",
+                module = "inference_service_metrics",
+                level = "info",
+                "No cached SgLang metrics, getting them directly"
+            );
+            let sglang_chat_completions_service_urls_with_model: Vec<(String, String, String)> =
+                sglang_chat_completions_service_urls
+                    .iter()
+                    .map(|(url, job)| (model.to_string(), url.clone(), job.clone()))
+                    .collect();
+            get_metrics(
+                &InferenceService::SgLang,
+                &sglang_chat_completions_service_urls_with_model,
+            )
+            .await
+        };
+
+        tracing::debug!(
+            target = "atoma-service",
+            module = "inference_service_metrics",
+            level = "info",
+            "Received vLLM metrics: {vllm_metrics:?}, SgLang metrics: {sglang_metrics:?}"
+        );
+
+        let mut metrics_results = Vec::new();
+        for metric in vllm_metrics.into_iter().chain(sglang_metrics.into_iter()) {
+            match metric {
+                Ok(ChatCompletionsMetrics {
+                    model: current_model,
+                    chat_completions_service_url,
+                    num_queued_requests,
+                    num_running_requests,
+                }) => {
+                    tracing::info!(
+                        target = "atoma-service",
+                        module = "inference_service_metrics",
+                        level = "info",
+                        "current_model = {current_model}, model = {model}, they are equal = {}",
+                        current_model == model
+                    );
+                    if current_model.to_lowercase() != model.to_lowercase() {
+                        // NOTE: We only want to consider metrics for the current model
+                        continue;
+                    }
+                    info!(
+                        target = "atoma-service",
+                        module = "vllm_metrics",
+                        level = "info",
+                        "Received vLLM/SgLang metrics response for {chat_completions_service_url}:\n
+                            num_queued_requests={num_queued_requests},
+                            num_running_requests={num_running_requests}"
+                    );
+                    metrics_results.push(ChatCompletionsMetrics {
+                        model: current_model,
+                        chat_completions_service_url,
+                        num_queued_requests,
+                        num_running_requests,
+                    });
+                }
+                Err(e) => {
+                    tracing::warn!(
+                        target = "atoma-service",
+                        module = "vllm_metrics",
+                        level = "error",
+                        "Failed to get metrics for chat completions service url with error: {e}",
+                    );
+                }
             }
         }
 
-        tracing::warn!(
+        if metrics_results.is_empty() {
+            tracing::warn!(
+                target = "atoma-service",
+                level = "warn",
+                "No metrics found for model: {model}",
+            );
+            // NOTE: In this case, we pick one of the urls at random
+            let random_index = rand::thread_rng().gen_range(0..chat_completions_service_urls.len());
+            let best_url = chat_completions_service_urls[random_index].0.clone();
+            return Ok((best_url, StatusCode::OK));
+        }
+
+        // Select the best available chat completions service URL based on the number of queued and running requests.
+        let best_metrics = metrics_results
+            .iter()
+            .min_by_key(|metric| {
+                (
+                    metric.num_queued_requests as i64,
+                    metric.num_running_requests as i64,
+                )
+            })
+            .unwrap();
+
+        if best_metrics.num_queued_requests >= MAX_ALLOWED_NUM_QUEUED_REQUESTS {
+            tracing::warn!(
+                target = "atoma-service",
+                level = "warn",
+                "Node is currently under high load, the best available chat completions service URL for model: {model} has a num queue requests of at least {} requests",
+                best_metrics.num_queued_requests
+            );
+            CHAT_COMPLETIONS_TOO_MANY_REQUESTS.add(1, &[KeyValue::new("model", model.to_string())]);
+            return Ok((
+                chat_completions_service_urls[0].0.clone(),
+                StatusCode::TOO_MANY_REQUESTS,
+            ));
+        }
+
+        let best_url = best_metrics.chat_completions_service_url.clone();
+        tracing::info!(
             target = "atoma-service",
-            model = model,
-            "No chat completions service URLs below max capacity found, returning TOO_MANY_REQUESTS status."
+            level = "info",
+            "Best available chat completions service URL for model: {model} is: {best_url} with and {} queue requests",
+            best_metrics.num_queued_requests
         );
 
-        return Ok((String::new(), StatusCode::TOO_MANY_REQUESTS));
+        Ok((best_url, StatusCode::OK))
     }
 
     #[derive(Debug, thiserror::Error, Clone)]
     pub enum ChatCompletionsMetricsError {
+        #[error("Failed to get metrics: {0}")]
+        GetMetricsError(String),
         #[error("No chat completions service urls found for model: {0}")]
         NoChatCompletionsServiceUrlsFound(String),
+        #[error("Invalid metrics value: {0}")]
+        InvalidMetricsValue(String),
+        #[error("Invalid metrics response: {0}")]
+        InvalidMetricsResponse(String),
+        #[error("Failed to create HTTP client: {0}")]
+        FailedToCreateHttpClient(String),
+        #[error("No metrics found for job: {0}")]
+        NoMetricsFound(String),
+    }
+
+    // From implementations to handle conversions from error types to our cloneable error type
+    impl From<reqwest::Error> for ChatCompletionsMetricsError {
+        fn from(err: reqwest::Error) -> Self {
+            Self::GetMetricsError(err.to_string())
+        }
+    }
+
+    impl From<std::num::ParseFloatError> for ChatCompletionsMetricsError {
+        fn from(err: std::num::ParseFloatError) -> Self {
+            Self::InvalidMetricsValue(err.to_string())
+        }
+    }
+
+    impl From<serde_json::Error> for ChatCompletionsMetricsError {
+        fn from(err: serde_json::Error) -> Self {
+            Self::InvalidMetricsResponse(err.to_string())
+        }
+    }
+
+    impl From<prometheus_http_query::Error> for ChatCompletionsMetricsError {
+        fn from(err: prometheus_http_query::Error) -> Self {
+            Self::FailedToCreateHttpClient(err.to_string())
+        }
     }
 }
diff --git a/atoma-service/src/handlers/request_counter.rs b/atoma-service/src/handlers/request_counter.rs
deleted file mode 100644
index 73444fca..00000000
--- a/atoma-service/src/handlers/request_counter.rs
+++ /dev/null
@@ -1,73 +0,0 @@
-use atoma_p2p::metrics::RUNNING_REQUESTS;
-use dashmap::{DashMap, Entry};
-use opentelemetry::KeyValue;
-use tracing::error;
-
-/// A thread-safe request counter that tracks the number of requests being processed for each inference service.
-#[derive(Clone, Debug)]
-pub struct RequestCounter {
-    /// A map that holds the count of running requests for each inference service.
-    running_num_requests: DashMap<String, usize>,
-}
-
-impl Default for RequestCounter {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl RequestCounter {
-    /// Creates a new instance of `RequestCounter`.
-    #[must_use]
-    pub fn new() -> Self {
-        Self {
-            running_num_requests: DashMap::new(),
-        }
-    }
-
-    /// Increments the count for the given key or initializes it to 1 if it does not exist.
-    pub fn increment(&self, key: &str, max_value: usize) -> bool {
-        let mut entry = self
-            .running_num_requests
-            .entry(key.to_string())
-            .or_insert(0);
-        if *entry >= max_value {
-            false
-        } else {
-            *entry += 1;
-            RUNNING_REQUESTS.record(*entry as u64, &[KeyValue::new("service", key.to_string())]);
-            true
-        }
-    }
-
-    /// Decrements the count for the given key. If the count reaches zero, the entry is removed.
-    pub fn decrement(&self, key: &str) {
-        match self.running_num_requests.entry(key.to_string()) {
-            Entry::Occupied(mut entry) => {
-                let count = entry.get_mut();
-                *count -= 1;
-                RUNNING_REQUESTS
-                    .record(*count as u64, &[KeyValue::new("service", key.to_string())]);
-                if *count == 0 {
-                    entry.remove();
-                }
-            }
-            Entry::Vacant(_) => {
-                // This should not happen
-                error!(
-                    target = "atoma-service",
-                    level = "info",
-                    event = "chat-completions-handler",
-                    "Attempted to decrement a non-existent key: {}",
-                    key
-                );
-            }
-        }
-    }
-
-    /// Retrieves the current count for the given key.
-    #[must_use]
-    pub fn get_count(&self, key: &str) -> usize {
-        self.running_num_requests.get(key).map_or(0, |entry| *entry)
-    }
-}
diff --git a/atoma-service/src/server.rs b/atoma-service/src/server.rs
index 7f4638f6..0db7e1bc 100644
--- a/atoma-service/src/server.rs
+++ b/atoma-service/src/server.rs
@@ -54,7 +54,6 @@ use crate::{
             confidential_image_generations_handler, image_generations_handler,
             CONFIDENTIAL_IMAGE_GENERATIONS_PATH, IMAGE_GENERATIONS_PATH,
         },
-        request_counter::RequestCounter,
         stop_streamer::stop_streamer_handler,
     },
     middleware::{
@@ -175,7 +174,7 @@ pub struct AppState {
     /// These URLs point to the external services responsible for performing
     /// AI model chat completions. The application forwards requests to this
     /// service to obtain AI-generated responses.
-    pub chat_completions_service_urls: HashMap<String, Vec<(String, String, usize)>>,
+    pub chat_completions_service_urls: HashMap<String, Vec<(String, String)>>,
 
     /// URL for the embeddings service.
     ///
@@ -205,9 +204,6 @@ pub struct AppState {
 
     /// The Sui address of the clients that are allowed to use fiat.
     pub whitelist_sui_addresses_for_fiat: Vec<String>,
-
-    /// Number of running requests for each inference service.
-    pub running_num_requests: Arc<RequestCounter>,
 }
 
 /// Creates and configures the main router for the application.
diff --git a/atoma-service/src/streamer.rs b/atoma-service/src/streamer.rs
index 9ded19e1..bb1ffc2b 100644
--- a/atoma-service/src/streamer.rs
+++ b/atoma-service/src/streamer.rs
@@ -31,7 +31,6 @@ use crate::{
             CHAT_COMPLETIONS_STREAMING_LATENCY_METRICS, CHAT_COMPLETIONS_TIME_TO_FIRST_TOKEN,
             TOTAL_COMPLETED_REQUESTS,
         },
-        request_counter::RequestCounter,
         update_fiat_amount, update_stack_num_compute_units, USAGE_KEY,
     },
     server::utils,
@@ -144,10 +143,6 @@ pub struct Streamer {
     user_id: Option<i64>,
     /// The user address for the request
     user_address: String,
-    /// A map to keep track of the number of requests currently being processed
-    running_num_requests: Arc<RequestCounter>,
-    /// The URL of the chat completions service
-    chat_completions_service_url: String,
 }
 
 /// Represents the various states of a streaming process
@@ -185,8 +180,6 @@ impl Streamer {
         price_per_one_million_tokens: i64,
         user_id: Option<i64>,
         user_address: String,
-        running_num_requests: Arc<RequestCounter>,
-        chat_completions_service_url: String,
     ) -> Self {
         Self {
             concurrent_requests,
@@ -213,8 +206,6 @@ impl Streamer {
             price_per_one_million_tokens,
             user_id,
             user_address,
-            running_num_requests,
-            chat_completions_service_url,
         }
     }
 
@@ -914,9 +905,6 @@ impl Drop for Streamer {
         )
     )]
     fn drop(&mut self) {
-        self.running_num_requests
-            .decrement(&self.chat_completions_service_url);
-
         if self.is_final_chunk_handled || matches!(self.status, StreamStatus::Failed(_)) {
             TOTAL_COMPLETED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, self.model.clone())]);
             return;
diff --git a/atoma-service/src/tests.rs b/atoma-service/src/tests.rs
index c5f726cc..c006dbb5 100644
--- a/atoma-service/src/tests.rs
+++ b/atoma-service/src/tests.rs
@@ -34,7 +34,7 @@ mod middleware {
     use crate::{
         handlers::{
             chat_completions::CHAT_COMPLETIONS_PATH, embeddings::EMBEDDINGS_PATH,
-            image_generations::IMAGE_GENERATIONS_PATH, request_counter::RequestCounter,
+            image_generations::IMAGE_GENERATIONS_PATH,
         },
         middleware::{
             confidential_compute_middleware, signature_verification_middleware, verify_permissions,
@@ -341,7 +341,6 @@ mod middleware {
                 address_index: 0,
                 stack_retrieve_sender,
                 whitelist_sui_addresses_for_fiat: vec![],
-                running_num_requests: Arc::new(RequestCounter::new()),
             },
             public_key,
             signature,
diff --git a/config.example.toml b/config.example.toml
index afbc3765..da176dac 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -3,42 +3,34 @@ chat_completions_service_urls = { "Infermatic/Llama-3.3-70B-Instruct-FP8-Dynamic
     [
         "http://chat-completions1:8000",
         "vllm1",
-        256,
     ],
     [
         "http://chat-completions2:8000",
         "vllm2",
-        256,
     ],
     [
         "http://chat-completions3:8000",
         "vllm3",
-        256,
     ],
     [
         "http://chat-completions4:8000",
         "vllm4",
-        256,
     ],
     [
         "http://chat-completions5:8000",
         "vllm5",
-        256,
     ],
     [
         "http://chat-completions6:8000",
         "vllm6",
-        256,
     ],
     [
         "http://chat-completions7:8000",
         "vllm7",
-        256,
     ],
     [
         "http://chat-completions8:8000",
         "vllm8",
-        256,
     ],
 ] }
 embeddings_service_url = "http://embeddings:80"

From dee70b771715d037a46ac357626605cca0b0d7cf Mon Sep 17 00:00:00 2001
From: Martin Stefcek <35243812+Cifko@users.noreply.github.com>
Date: Fri, 30 May 2025 16:14:41 +0200
Subject: [PATCH 04/13] feat: turn on too many requests for a period of time
 (#647)

---
 atoma-bin/atoma_node.rs                        | 2 ++
 atoma-service/src/config.rs                    | 3 +++
 atoma-service/src/handlers/chat_completions.rs | 6 ++++++
 atoma-service/src/handlers/completions.rs      | 6 ++++++
 atoma-service/src/middleware.rs                | 9 +++++++++
 atoma-service/src/server.rs                    | 8 +++++++-
 atoma-service/src/tests.rs                     | 2 ++
 config.example.toml                            | 1 +
 8 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/atoma-bin/atoma_node.rs b/atoma-bin/atoma_node.rs
index 6359881d..1f565724 100644
--- a/atoma-bin/atoma_node.rs
+++ b/atoma-bin/atoma_node.rs
@@ -371,6 +371,8 @@ async fn main() -> Result<()> {
         keystore: Arc::new(keystore),
         address_index,
         whitelist_sui_addresses_for_fiat: config.service.whitelist_sui_addresses_for_fiat,
+        too_many_requests: Arc::new(DashMap::new()),
+        too_many_requests_timeout_ms: u128::from(config.service.too_many_requests_timeout_ms),
     };
 
     let chat_completions_service_urls = app_state
diff --git a/atoma-service/src/config.rs b/atoma-service/src/config.rs
index f7544506..045adba2 100644
--- a/atoma-service/src/config.rs
+++ b/atoma-service/src/config.rs
@@ -57,6 +57,9 @@ pub struct AtomaServiceConfig {
 
     /// List of allowed sui addresses for fiat payments.
     pub whitelist_sui_addresses_for_fiat: Vec<String>,
+
+    /// The timeout for the too many requests error in milliseconds.
+    pub too_many_requests_timeout_ms: u64,
 }
 
 impl AtomaServiceConfig {
diff --git a/atoma-service/src/handlers/chat_completions.rs b/atoma-service/src/handlers/chat_completions.rs
index 89ec218f..b2d4dbbb 100644
--- a/atoma-service/src/handlers/chat_completions.rs
+++ b/atoma-service/src/handlers/chat_completions.rs
@@ -908,6 +908,9 @@ async fn handle_streaming_response(
                 endpoint: endpoint.clone(),
             })?;
     if status_code == StatusCode::TOO_MANY_REQUESTS {
+        state
+            .too_many_requests
+            .insert(model.to_string(), Instant::now());
         return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
             message: "Too many requests".to_string(),
             endpoint: endpoint.clone(),
@@ -1329,6 +1332,9 @@ pub mod utils {
                 endpoint: endpoint.to_string(),
             })?;
         if status_code == StatusCode::TOO_MANY_REQUESTS {
+            state
+                .too_many_requests
+                .insert(model.to_string(), Instant::now());
             return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
                 message: "Too many requests".to_string(),
                 endpoint: endpoint.to_string(),
diff --git a/atoma-service/src/handlers/completions.rs b/atoma-service/src/handlers/completions.rs
index e3bbc3b5..8b3375f7 100644
--- a/atoma-service/src/handlers/completions.rs
+++ b/atoma-service/src/handlers/completions.rs
@@ -882,6 +882,9 @@ async fn handle_streaming_response(
                 endpoint: endpoint.clone(),
             })?;
     if status_code == StatusCode::TOO_MANY_REQUESTS {
+        state
+            .too_many_requests
+            .insert(model.to_string(), Instant::now());
         return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
             message: "Too many requests".to_string(),
             endpoint: endpoint.clone(),
@@ -1292,6 +1295,9 @@ pub mod utils {
                 endpoint: endpoint.to_string(),
             })?;
         if status_code == StatusCode::TOO_MANY_REQUESTS {
+            state
+                .too_many_requests
+                .insert(model.to_string(), Instant::now());
             return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
                 message: "Too many requests".to_string(),
                 endpoint: endpoint.to_string(),
diff --git a/atoma-service/src/middleware.rs b/atoma-service/src/middleware.rs
index 8371a089..620ffc12 100644
--- a/atoma-service/src/middleware.rs
+++ b/atoma-service/src/middleware.rs
@@ -811,6 +811,15 @@ pub async fn verify_permissions(
             message: "Model is not a string".to_string(),
             endpoint: endpoint.clone(),
         })?;
+    if let Some(trigger_time) = state.too_many_requests.get(model) {
+        if trigger_time.elapsed().as_millis() < state.too_many_requests_timeout_ms {
+            return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
+                message: "Too many requests".to_string(),
+                endpoint: endpoint.clone(),
+            });
+        }
+        state.too_many_requests.remove(model);
+    }
     if !state.models.contains(&model.to_string()) {
         return Err(AtomaServiceError::InvalidBody {
             message: format!("Model not supported, supported models: {:?}", state.models),
diff --git a/atoma-service/src/server.rs b/atoma-service/src/server.rs
index 0db7e1bc..9575db5f 100644
--- a/atoma-service/src/server.rs
+++ b/atoma-service/src/server.rs
@@ -1,4 +1,4 @@
-use std::{collections::HashMap, sync::Arc};
+use std::{collections::HashMap, sync::Arc, time::Instant};
 
 use atoma_confidential::types::{
     ConfidentialComputeDecryptionRequest, ConfidentialComputeDecryptionResponse,
@@ -204,6 +204,12 @@ pub struct AppState {
 
     /// The Sui address of the clients that are allowed to use fiat.
     pub whitelist_sui_addresses_for_fiat: Vec<String>,
+
+    /// The instant at which "too many requests" was triggered, keyed by model.
+    pub too_many_requests: Arc<DashMap<String, Instant>>,
+
+    /// The time (in milliseconds) for which we trigger too many requests since the first occurrence.
+    pub too_many_requests_timeout_ms: u128,
 }
 
 /// Creates and configures the main router for the application.
diff --git a/atoma-service/src/tests.rs b/atoma-service/src/tests.rs
index c006dbb5..a9ff3732 100644
--- a/atoma-service/src/tests.rs
+++ b/atoma-service/src/tests.rs
@@ -341,6 +341,8 @@ mod middleware {
                 address_index: 0,
                 stack_retrieve_sender,
                 whitelist_sui_addresses_for_fiat: vec![],
+                too_many_requests: Arc::new(DashMap::new()),
+                too_many_requests_timeout_ms: 0,
             },
             public_key,
             signature,
diff --git a/config.example.toml b/config.example.toml
index da176dac..744a35e5 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -43,6 +43,7 @@ models                           = [ "Infermatic/Llama-3.3-70B-Instruct-FP8-Dyna
 revisions                        = [ "main" ]
 sentry_dsn                       = ""                                                  # Sentry DSN (for use in sentry, you need to set the Sentry DSN)
 service_bind_address             = "0.0.0.0:3000"
+too_many_requests_timeout_ms     = 2000                                                # Timeout for too many requests flag in milliseconds
 whitelist_sui_addresses_for_fiat = [  ]                                                # Sui addresses that are allowed to use fiat payments
 
 [atoma_sui]

From 94d157b83ef83a5a9952cc09bda79e9ac6b5d22a Mon Sep 17 00:00:00 2001
From: Martin Stefcek <35243812+Cifko@users.noreply.github.com>
Date: Fri, 30 May 2025 17:44:50 +0200
Subject: [PATCH 05/13] feat: add request running cap (#649)

* feat: add request running cap

* fix clippy

---------

Co-authored-by: Jorge Antonio <matroid@outlook.com>
---
 atoma-bin/atoma_node.rs                       |  14 +-
 atoma-service/src/config.rs                   |   4 +-
 .../src/handlers/chat_completions.rs          |  33 ++++-
 atoma-service/src/handlers/completions.rs     |  34 +++--
 atoma-service/src/handlers/mod.rs             | 131 ++++++++++--------
 atoma-service/src/handlers/request_counter.rs |  73 ++++++++++
 atoma-service/src/server.rs                   |   6 +-
 atoma-service/src/streamer.rs                 |  12 ++
 atoma-service/src/tests.rs                    |   3 +-
 config.example.toml                           |   8 ++
 10 files changed, 243 insertions(+), 75 deletions(-)
 create mode 100644 atoma-service/src/handlers/request_counter.rs

diff --git a/atoma-bin/atoma_node.rs b/atoma-bin/atoma_node.rs
index 1f565724..6fb87822 100644
--- a/atoma-bin/atoma_node.rs
+++ b/atoma-bin/atoma_node.rs
@@ -4,7 +4,9 @@ use anyhow::{Context, Result};
 use atoma_confidential::AtomaConfidentialCompute;
 use atoma_daemon::{telemetry, AtomaDaemonConfig, DaemonState};
 use atoma_p2p::{AtomaP2pNode, AtomaP2pNodeConfig};
-use atoma_service::{config::AtomaServiceConfig, server::AppState};
+use atoma_service::{
+    config::AtomaServiceConfig, handlers::request_counter::RequestCounter, server::AppState,
+};
 use atoma_state::{config::AtomaStateManagerConfig, AtomaState, AtomaStateManager};
 use atoma_sui::{client::Client, config::Config, subscriber::Subscriber};
 use atoma_utils::spawn_with_shutdown;
@@ -373,6 +375,7 @@ async fn main() -> Result<()> {
         whitelist_sui_addresses_for_fiat: config.service.whitelist_sui_addresses_for_fiat,
         too_many_requests: Arc::new(DashMap::new()),
         too_many_requests_timeout_ms: u128::from(config.service.too_many_requests_timeout_ms),
+        running_num_requests: Arc::new(RequestCounter::new()),
     };
 
     let chat_completions_service_urls = app_state
@@ -380,7 +383,14 @@ async fn main() -> Result<()> {
         .iter()
         .flat_map(|(model, urls)| {
             urls.iter()
-                .map(|(url, job)| (model.clone(), url.clone(), job.clone()))
+                .map(|(url, job, max_number_of_running_requests)| {
+                    (
+                        model.clone(),
+                        url.clone(),
+                        job.clone(),
+                        *max_number_of_running_requests,
+                    )
+                })
         })
         .collect();
     atoma_service::handlers::inference_service_metrics::start_metrics_updater(
diff --git a/atoma-service/src/config.rs b/atoma-service/src/config.rs
index 045adba2..f7cff89a 100644
--- a/atoma-service/src/config.rs
+++ b/atoma-service/src/config.rs
@@ -9,10 +9,10 @@ use serde::Deserialize;
 /// including URLs for various services and a list of models.
 #[derive(Debug, Deserialize)]
 pub struct AtomaServiceConfig {
-    /// URL for the chat completions service.
+    /// URL for the chat completions service with maximum concurrency settings.
     ///
     /// This field specifies the endpoint for the chat completions service used by the Atoma Service.
-    pub chat_completions_service_urls: HashMap<String, Vec<(String, String)>>,
+    pub chat_completions_service_urls: HashMap<String, Vec<(String, String, usize)>>,
 
     /// URL for the embeddings service.
     ///
diff --git a/atoma-service/src/handlers/chat_completions.rs b/atoma-service/src/handlers/chat_completions.rs
index b2d4dbbb..56f4d785 100644
--- a/atoma-service/src/handlers/chat_completions.rs
+++ b/atoma-service/src/handlers/chat_completions.rs
@@ -49,7 +49,10 @@ use tracing::{debug, info, instrument};
 use utoipa::OpenApi;
 
 use serde::Deserialize;
-use std::time::{Duration, Instant};
+use std::{
+    sync::Arc,
+    time::{Duration, Instant},
+};
 
 use crate::{
     error::AtomaServiceError,
@@ -901,12 +904,16 @@ async fn handle_streaming_response(
             }
         })?;
     let (chat_completions_service_url, status_code) =
-        get_best_available_chat_completions_service_url(chat_completions_service_urls, model)
-            .await
-            .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
-                message: e.to_string(),
-                endpoint: endpoint.clone(),
-            })?;
+        get_best_available_chat_completions_service_url(
+            &state.running_num_requests,
+            chat_completions_service_urls,
+            &model.to_lowercase(),
+        )
+        .await
+        .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
+            message: e.to_string(),
+            endpoint: endpoint.clone(),
+        })?;
     if status_code == StatusCode::TOO_MANY_REQUESTS {
         state
             .too_many_requests
@@ -927,6 +934,9 @@ async fn handle_streaming_response(
         .send()
         .await
         .map_err(|e| {
+            state
+                .running_num_requests
+                .decrement(&chat_completions_service_url);
             AtomaServiceError::InternalError {
                 message: format!(
                     "Error sending request to inference service, for request with payload hash: {:?}, and stack small id: {:?}, with error: {}",
@@ -939,6 +949,9 @@ async fn handle_streaming_response(
         })?;
 
     if !response.status().is_success() {
+        state
+            .running_num_requests
+            .decrement(&chat_completions_service_url);
         let status = response.status();
         let bytes = response
             .bytes()
@@ -992,6 +1005,8 @@ async fn handle_streaming_response(
         price_per_one_million_compute_units,
         user_id,
         user_address,
+        Arc::clone(&state.running_num_requests),
+        chat_completions_service_url,
     ))
     .keep_alive(
         axum::response::sse::KeepAlive::new()
@@ -1323,6 +1338,7 @@ pub mod utils {
             })?;
         let (chat_completions_service_url, status_code) =
             get_best_available_chat_completions_service_url(
+                &state.running_num_requests,
                 chat_completions_service_url_services,
                 model,
             )
@@ -1348,6 +1364,9 @@ pub mod utils {
             .json(&payload)
             .send()
             .await;
+        state
+            .running_num_requests
+            .decrement(&chat_completions_service_url);
         let response = response.map_err(|e| {
             AtomaServiceError::InternalError {
                 message: format!(
diff --git a/atoma-service/src/handlers/completions.rs b/atoma-service/src/handlers/completions.rs
index 8b3375f7..a2da3069 100644
--- a/atoma-service/src/handlers/completions.rs
+++ b/atoma-service/src/handlers/completions.rs
@@ -37,7 +37,10 @@ use tracing::{debug, info, instrument};
 use utoipa::OpenApi;
 
 use serde::Deserialize;
-use std::time::{Duration, Instant};
+use std::{
+    sync::Arc,
+    time::{Duration, Instant},
+};
 
 use crate::{
     error::AtomaServiceError,
@@ -874,13 +877,16 @@ async fn handle_streaming_response(
                 endpoint: endpoint.clone(),
             }
         })?;
-    let (completions_service_url, status_code) =
-        get_best_available_chat_completions_service_url(chat_completions_service_urls, model)
-            .await
-            .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
-                message: e.to_string(),
-                endpoint: endpoint.clone(),
-            })?;
+    let (completions_service_url, status_code) = get_best_available_chat_completions_service_url(
+        &state.running_num_requests,
+        chat_completions_service_urls,
+        model,
+    )
+    .await
+    .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
+        message: e.to_string(),
+        endpoint: endpoint.clone(),
+    })?;
     if status_code == StatusCode::TOO_MANY_REQUESTS {
         state
             .too_many_requests
@@ -898,6 +904,9 @@ async fn handle_streaming_response(
         .send()
         .await
         .map_err(|e| {
+            state
+                .running_num_requests
+                .decrement(&completions_service_url);
             AtomaServiceError::InternalError {
                 message: format!(
                     "Error sending request to inference service, for request with payload hash: {:?}, and stack small id: {:?}, with error: {}",
@@ -910,6 +919,9 @@ async fn handle_streaming_response(
     })?;
 
     if !response.status().is_success() {
+        state
+            .running_num_requests
+            .decrement(&completions_service_url);
         let status = response.status();
         let bytes = response
             .bytes()
@@ -962,6 +974,8 @@ async fn handle_streaming_response(
         price_per_one_million_tokens,
         user_id,
         user_address,
+        Arc::clone(&state.running_num_requests),
+        completions_service_url,
     ))
     .keep_alive(
         axum::response::sse::KeepAlive::new()
@@ -1286,6 +1300,7 @@ pub mod utils {
             })?;
         let (completions_service_url, status_code) =
             get_best_available_chat_completions_service_url(
+                &state.running_num_requests,
                 completions_service_url_services,
                 model,
             )
@@ -1308,6 +1323,9 @@ pub mod utils {
             .json(&payload)
             .send()
             .await;
+        state
+            .running_num_requests
+            .decrement(&completions_service_url);
         let response = response
         .map_err(|e| {
             AtomaServiceError::InternalError {
diff --git a/atoma-service/src/handlers/mod.rs b/atoma-service/src/handlers/mod.rs
index aa79a7e4..201d2213 100644
--- a/atoma-service/src/handlers/mod.rs
+++ b/atoma-service/src/handlers/mod.rs
@@ -4,6 +4,7 @@ pub mod completions;
 pub mod embeddings;
 pub mod image_generations;
 pub mod metrics;
+pub mod request_counter;
 pub mod request_model;
 pub mod stop_streamer;
 
@@ -572,7 +573,6 @@ pub fn handle_status_code_error(
 
 pub mod inference_service_metrics {
     use futures::future::join_all;
-    use opentelemetry::KeyValue;
     use prometheus_parse::Scrape;
     use prometheus_parse::Value;
     use rand::Rng;
@@ -581,12 +581,13 @@ pub mod inference_service_metrics {
     use std::time::Duration;
     use tokio::sync::RwLock;
     use tokio::time;
+    use tracing::info;
 
-    use crate::handlers::metrics::CHAT_COMPLETIONS_TOO_MANY_REQUESTS;
+    use crate::handlers::InferenceService;
     use hyper::StatusCode;
-    use tracing::{info, instrument};
+    use tracing::instrument;
 
-    use super::InferenceService;
+    use super::request_counter::RequestCounter;
 
     pub type Result<T> = std::result::Result<T, ChatCompletionsMetricsError>;
     type MetricValue = ChatCompletionsMetrics;
@@ -620,6 +621,8 @@ pub mod inference_service_metrics {
         num_queued_requests: f64,
         /// The number of running requests
         num_running_requests: f64,
+        /// The maximum number of running requests allowed for this url.
+        max_number_of_running_requests: usize,
     }
 
     /// Cache structure to store metrics
@@ -660,10 +663,10 @@ pub mod inference_service_metrics {
     /// * `metrics_update_interval` - The interval in seconds to update the metrics.
     #[instrument(level = "info", skip_all)]
     pub fn start_metrics_updater(
-        chat_completions_service_urls: Vec<(String, String, String)>,
+        chat_completions_service_urls: Vec<(String, String, String, usize)>,
         metrics_update_interval: Option<u64>,
     ) {
-        type ChatCompletionsServiceUrls = Vec<(String, String, String)>;
+        type ChatCompletionsServiceUrls = Vec<(String, String, String, usize)>;
         info!(
             target = "atoma-service",
             module = "inference_service_metrics",
@@ -676,7 +679,7 @@ pub mod inference_service_metrics {
         ) = chat_completions_service_urls
             .iter()
             .cloned()
-            .partition(|(_, _, job)| job.contains("vllm"));
+            .partition(|(_, _, job, _)| job.contains("vllm"));
         info!(
             target = "atoma-service",
             module = "inference_service_metrics",
@@ -756,12 +759,12 @@ pub mod inference_service_metrics {
     ///     gracefully.
     async fn get_metrics(
         inference_service: &InferenceService,
-        jobs_with_url: &[(String, String, String)], // (model, url, job)
+        jobs_with_url: &[(String, String, String, usize)], // (model, url, job, max_concurrent_requests)
     ) -> Vec<Result<ChatCompletionsMetrics>> {
         let tasks =
             jobs_with_url
                 .iter()
-                .map(|(model, chat_completions_service_url, job)| async move {
+                .map(|(model, chat_completions_service_url, job, max_number_of_running_requests)| async move {
                     let response = HTTP_CLIENT
                         .get(format!("{chat_completions_service_url}/metrics"))
                         .send()
@@ -790,6 +793,7 @@ pub mod inference_service_metrics {
                         chat_completions_service_url: chat_completions_service_url.clone(),
                         num_queued_requests,
                         num_running_requests,
+                        max_number_of_running_requests:*max_number_of_running_requests
                     })
                 });
         join_all(tasks).await
@@ -895,14 +899,18 @@ pub mod inference_service_metrics {
     #[instrument(level = "info", skip_all, fields(model=model))]
     #[allow(clippy::float_cmp)]
     pub async fn get_best_available_chat_completions_service_url(
-        chat_completions_service_urls: &[(String, String)],
+        running_num_requests: &RequestCounter,
+        chat_completions_service_urls: &[(String, String, usize)], // (url, job, max_concurrent_requests)
         model: &str,
     ) -> Result<(String, StatusCode)> {
-        const MAX_ALLOWED_NUM_QUEUED_REQUESTS: f64 = 1.0; // Default to 1 request
-
-        type ChatCompletionsServiceUrls = Vec<(String, String)>;
+        type ChatCompletionsServiceUrls = Vec<(String, String, usize)>;
 
         if chat_completions_service_urls.is_empty() {
+            tracing::warn!(
+                target = "atoma-service",
+                model = model,
+                "No chat completions service URLs provided for model."
+            );
             return Err(
                 ChatCompletionsMetricsError::NoChatCompletionsServiceUrlsFound(model.to_string()),
             );
@@ -919,7 +927,7 @@ pub mod inference_service_metrics {
         ) = chat_completions_service_urls
             .iter()
             .cloned()
-            .partition(|(_, job)| job.contains("vllm"));
+            .partition(|(_, job, _)| job.contains("vllm"));
 
         tracing::debug!(
             target = "atoma-service",
@@ -940,11 +948,22 @@ pub mod inference_service_metrics {
                 level = "info",
                 "No cached vLLM metrics, getting them directly"
             );
-            let vllm_chat_completions_service_urls_with_model: Vec<(String, String, String)> =
-                vllm_chat_completions_service_urls
-                    .iter()
-                    .map(|(url, job)| (model.to_string(), url.clone(), job.clone()))
-                    .collect();
+            let vllm_chat_completions_service_urls_with_model: Vec<(
+                String,
+                String,
+                String,
+                usize,
+            )> = vllm_chat_completions_service_urls
+                .iter()
+                .map(|(url, job, max_concurrent_requests)| {
+                    (
+                        model.to_string(),
+                        url.clone(),
+                        job.clone(),
+                        *max_concurrent_requests,
+                    )
+                })
+                .collect();
             get_metrics(
                 &InferenceService::Vllm,
                 &vllm_chat_completions_service_urls_with_model,
@@ -962,11 +981,22 @@ pub mod inference_service_metrics {
                 level = "info",
                 "No cached SgLang metrics, getting them directly"
             );
-            let sglang_chat_completions_service_urls_with_model: Vec<(String, String, String)> =
-                sglang_chat_completions_service_urls
-                    .iter()
-                    .map(|(url, job)| (model.to_string(), url.clone(), job.clone()))
-                    .collect();
+            let sglang_chat_completions_service_urls_with_model: Vec<(
+                String,
+                String,
+                String,
+                usize,
+            )> = sglang_chat_completions_service_urls
+                .iter()
+                .map(|(url, job, max_concurrent_requests)| {
+                    (
+                        model.to_string(),
+                        url.clone(),
+                        job.clone(),
+                        *max_concurrent_requests,
+                    )
+                })
+                .collect();
             get_metrics(
                 &InferenceService::SgLang,
                 &sglang_chat_completions_service_urls_with_model,
@@ -989,6 +1019,7 @@ pub mod inference_service_metrics {
                     chat_completions_service_url,
                     num_queued_requests,
                     num_running_requests,
+                    max_number_of_running_requests,
                 }) => {
                     tracing::info!(
                         target = "atoma-service",
@@ -1014,6 +1045,7 @@ pub mod inference_service_metrics {
                         chat_completions_service_url,
                         num_queued_requests,
                         num_running_requests,
+                        max_number_of_running_requests,
                     });
                 }
                 Err(e) => {
@@ -1040,39 +1072,30 @@ pub mod inference_service_metrics {
         }
 
         // Select the best available chat completions service URL based on the number of queued and running requests.
-        let best_metrics = metrics_results
-            .iter()
-            .min_by_key(|metric| {
-                (
-                    metric.num_queued_requests as i64,
-                    metric.num_running_requests as i64,
-                )
-            })
-            .unwrap();
+        metrics_results.sort_by_key(|metric| {
+            (
+                metric.num_queued_requests as i64,
+                metric.num_running_requests as i64,
+            )
+        });
 
-        if best_metrics.num_queued_requests >= MAX_ALLOWED_NUM_QUEUED_REQUESTS {
-            tracing::warn!(
-                target = "atoma-service",
-                level = "warn",
-                "Node is currently under high load, the best available chat completions service URL for model: {model} has a num queue requests of at least {} requests",
-                best_metrics.num_queued_requests
-            );
-            CHAT_COMPLETIONS_TOO_MANY_REQUESTS.add(1, &[KeyValue::new("model", model.to_string())]);
-            return Ok((
-                chat_completions_service_urls[0].0.clone(),
-                StatusCode::TOO_MANY_REQUESTS,
-            ));
+        for metric in metrics_results {
+            if running_num_requests.increment(
+                &metric.chat_completions_service_url,
+                metric.max_number_of_running_requests,
+            ) {
+                let best_url = metric.chat_completions_service_url.clone();
+                tracing::info!(
+                    target = "atoma-service",
+                    level = "info",
+                    "Best available chat completions service URL for model: {model} is: {best_url} with and {} queue requests",
+                    metric.num_queued_requests
+                );
+                return Ok((best_url, StatusCode::OK));
+            }
         }
 
-        let best_url = best_metrics.chat_completions_service_url.clone();
-        tracing::info!(
-            target = "atoma-service",
-            level = "info",
-            "Best available chat completions service URL for model: {model} is: {best_url} with and {} queue requests",
-            best_metrics.num_queued_requests
-        );
-
-        Ok((best_url, StatusCode::OK))
+        return Ok((String::new(), StatusCode::TOO_MANY_REQUESTS));
     }
 
     #[derive(Debug, thiserror::Error, Clone)]
diff --git a/atoma-service/src/handlers/request_counter.rs b/atoma-service/src/handlers/request_counter.rs
new file mode 100644
index 00000000..73444fca
--- /dev/null
+++ b/atoma-service/src/handlers/request_counter.rs
@@ -0,0 +1,73 @@
+use atoma_p2p::metrics::RUNNING_REQUESTS;
+use dashmap::{DashMap, Entry};
+use opentelemetry::KeyValue;
+use tracing::error;
+
+/// A thread-safe request counter that tracks the number of requests being processed for each inference service.
+#[derive(Clone, Debug)]
+pub struct RequestCounter {
+    /// A map that holds the count of running requests for each inference service.
+    running_num_requests: DashMap<String, usize>,
+}
+
+impl Default for RequestCounter {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RequestCounter {
+    /// Creates a new instance of `RequestCounter`.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            running_num_requests: DashMap::new(),
+        }
+    }
+
+    /// Increments the count for the given key (starting from 0 if absent) unless it has already reached `max_value`; returns `true` only if the count was incremented.
+    pub fn increment(&self, key: &str, max_value: usize) -> bool {
+        let mut entry = self
+            .running_num_requests
+            .entry(key.to_string())
+            .or_insert(0);
+        if *entry >= max_value {
+            false
+        } else {
+            *entry += 1;
+            RUNNING_REQUESTS.record(*entry as u64, &[KeyValue::new("service", key.to_string())]);
+            true
+        }
+    }
+
+    /// Decrements the count for the given key. If the count reaches zero, the entry is removed.
+    pub fn decrement(&self, key: &str) {
+        match self.running_num_requests.entry(key.to_string()) {
+            Entry::Occupied(mut entry) => {
+                let count = entry.get_mut();
+                *count -= 1;
+                RUNNING_REQUESTS
+                    .record(*count as u64, &[KeyValue::new("service", key.to_string())]);
+                if *count == 0 {
+                    entry.remove();
+                }
+            }
+            Entry::Vacant(_) => {
+                // This should not happen
+                error!(
+                    target = "atoma-service",
+                    level = "info",
+                    event = "chat-completions-handler",
+                    "Attempted to decrement a non-existent key: {}",
+                    key
+                );
+            }
+        }
+    }
+
+    /// Retrieves the current count for the given key.
+    #[must_use]
+    pub fn get_count(&self, key: &str) -> usize {
+        self.running_num_requests.get(key).map_or(0, |entry| *entry)
+    }
+}
diff --git a/atoma-service/src/server.rs b/atoma-service/src/server.rs
index 9575db5f..125a9d9e 100644
--- a/atoma-service/src/server.rs
+++ b/atoma-service/src/server.rs
@@ -54,6 +54,7 @@ use crate::{
             confidential_image_generations_handler, image_generations_handler,
             CONFIDENTIAL_IMAGE_GENERATIONS_PATH, IMAGE_GENERATIONS_PATH,
         },
+        request_counter::RequestCounter,
         stop_streamer::stop_streamer_handler,
     },
     middleware::{
@@ -174,7 +175,7 @@ pub struct AppState {
     /// These URLs point to the external services responsible for performing
     /// AI model chat completions. The application forwards requests to this
     /// service to obtain AI-generated responses.
-    pub chat_completions_service_urls: HashMap<String, Vec<(String, String)>>,
+    pub chat_completions_service_urls: HashMap<String, Vec<(String, String, usize)>>,
 
     /// URL for the embeddings service.
     ///
@@ -210,6 +211,9 @@ pub struct AppState {
 
     /// The time for which we triiger too many requests since the first occurrence.
     pub too_many_requests_timeout_ms: u128,
+
+    /// Number of running requests for each inference service.
+    pub running_num_requests: Arc<RequestCounter>,
 }
 
 /// Creates and configures the main router for the application.
diff --git a/atoma-service/src/streamer.rs b/atoma-service/src/streamer.rs
index bb1ffc2b..9ded19e1 100644
--- a/atoma-service/src/streamer.rs
+++ b/atoma-service/src/streamer.rs
@@ -31,6 +31,7 @@ use crate::{
             CHAT_COMPLETIONS_STREAMING_LATENCY_METRICS, CHAT_COMPLETIONS_TIME_TO_FIRST_TOKEN,
             TOTAL_COMPLETED_REQUESTS,
         },
+        request_counter::RequestCounter,
         update_fiat_amount, update_stack_num_compute_units, USAGE_KEY,
     },
     server::utils,
@@ -143,6 +144,10 @@ pub struct Streamer {
     user_id: Option<i64>,
     /// The user address for the request
     user_address: String,
+    /// A map to keep track of the number of requests currently being processed
+    running_num_requests: Arc<RequestCounter>,
+    /// The URL of the chat completions service
+    chat_completions_service_url: String,
 }
 
 /// Represents the various states of a streaming process
@@ -180,6 +185,8 @@ impl Streamer {
         price_per_one_million_tokens: i64,
         user_id: Option<i64>,
         user_address: String,
+        running_num_requests: Arc<RequestCounter>,
+        chat_completions_service_url: String,
     ) -> Self {
         Self {
             concurrent_requests,
@@ -206,6 +213,8 @@ impl Streamer {
             price_per_one_million_tokens,
             user_id,
             user_address,
+            running_num_requests,
+            chat_completions_service_url,
         }
     }
 
@@ -905,6 +914,9 @@ impl Drop for Streamer {
         )
     )]
     fn drop(&mut self) {
+        self.running_num_requests
+            .decrement(&self.chat_completions_service_url);
+
         if self.is_final_chunk_handled || matches!(self.status, StreamStatus::Failed(_)) {
             TOTAL_COMPLETED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, self.model.clone())]);
             return;
diff --git a/atoma-service/src/tests.rs b/atoma-service/src/tests.rs
index a9ff3732..9410bfca 100644
--- a/atoma-service/src/tests.rs
+++ b/atoma-service/src/tests.rs
@@ -34,7 +34,7 @@ mod middleware {
     use crate::{
         handlers::{
             chat_completions::CHAT_COMPLETIONS_PATH, embeddings::EMBEDDINGS_PATH,
-            image_generations::IMAGE_GENERATIONS_PATH,
+            image_generations::IMAGE_GENERATIONS_PATH, request_counter::RequestCounter,
         },
         middleware::{
             confidential_compute_middleware, signature_verification_middleware, verify_permissions,
@@ -343,6 +343,7 @@ mod middleware {
                 whitelist_sui_addresses_for_fiat: vec![],
                 too_many_requests: Arc::new(DashMap::new()),
                 too_many_requests_timeout_ms: 0,
+                running_num_requests: Arc::new(RequestCounter::new()),
             },
             public_key,
             signature,
diff --git a/config.example.toml b/config.example.toml
index 744a35e5..3aa7c708 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -3,34 +3,42 @@ chat_completions_service_urls = { "Infermatic/Llama-3.3-70B-Instruct-FP8-Dynamic
     [
         "http://chat-completions1:8000",
         "vllm1",
+        256,
     ],
     [
         "http://chat-completions2:8000",
         "vllm2",
+        256,
     ],
     [
         "http://chat-completions3:8000",
         "vllm3",
+        256,
     ],
     [
         "http://chat-completions4:8000",
         "vllm4",
+        256,
     ],
     [
         "http://chat-completions5:8000",
         "vllm5",
+        256,
     ],
     [
         "http://chat-completions6:8000",
         "vllm6",
+        256,
     ],
     [
         "http://chat-completions7:8000",
         "vllm7",
+        256,
     ],
     [
         "http://chat-completions8:8000",
         "vllm8",
+        256,
     ],
 ] }
 embeddings_service_url = "http://embeddings:80"

From 1bbb4235e0db0897e3836b6772e777203f11ae63 Mon Sep 17 00:00:00 2001
From: Jorge Antonio <matroid@outlook.com>
Date: Fri, 30 May 2025 17:38:00 +0100
Subject: [PATCH 06/13] refactor num running requests for prometheus check

---
 atoma-service/src/handlers/metrics.rs         | 17 ++++++++++++++++-
 atoma-service/src/handlers/mod.rs             |  2 +-
 atoma-service/src/handlers/request_counter.rs |  8 +++++---
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/atoma-service/src/handlers/metrics.rs b/atoma-service/src/handlers/metrics.rs
index 7515e0eb..f10a1f5d 100644
--- a/atoma-service/src/handlers/metrics.rs
+++ b/atoma-service/src/handlers/metrics.rs
@@ -1,6 +1,6 @@
 use opentelemetry::{
     global,
-    metrics::{Counter, Histogram, Meter, UpDownCounter},
+    metrics::{Counter, Gauge, Histogram, Meter, UpDownCounter},
 };
 use std::sync::LazyLock;
 
@@ -616,3 +616,18 @@ pub static SIGNATURE_VERIFICATION_MIDDLEWARE_SUCCESSFUL_TIME: LazyLock<Histogram
         .with_boundaries(LATENCY_HISTOGRAM_BUCKETS.to_vec())
         .build()
     });
+
+/// Gauge metric that tracks the number of running requests.
+///
+/// # Metric Details
+/// - Name: `atoma_num_running_requests`
+/// - Type: Gauge
+/// - Labels: `service` (the per-service key that `RequestCounter`
+///   passes when it records the current count)
+pub static NUM_RUNNING_REQUESTS: LazyLock<Gauge<u64>> = LazyLock::new(|| {
+    GLOBAL_METER
+        .u64_gauge("atoma_num_running_requests")
+        .with_description("Number of running requests")
+        .with_unit("requests")
+        .build()
+});
diff --git a/atoma-service/src/handlers/mod.rs b/atoma-service/src/handlers/mod.rs
index 201d2213..7482dd53 100644
--- a/atoma-service/src/handlers/mod.rs
+++ b/atoma-service/src/handlers/mod.rs
@@ -793,7 +793,7 @@ pub mod inference_service_metrics {
                         chat_completions_service_url: chat_completions_service_url.clone(),
                         num_queued_requests,
                         num_running_requests,
-                        max_number_of_running_requests:*max_number_of_running_requests
+                        max_number_of_running_requests: *max_number_of_running_requests,
                     })
                 });
         join_all(tasks).await
diff --git a/atoma-service/src/handlers/request_counter.rs b/atoma-service/src/handlers/request_counter.rs
index 73444fca..a0570e85 100644
--- a/atoma-service/src/handlers/request_counter.rs
+++ b/atoma-service/src/handlers/request_counter.rs
@@ -1,8 +1,9 @@
-use atoma_p2p::metrics::RUNNING_REQUESTS;
 use dashmap::{DashMap, Entry};
 use opentelemetry::KeyValue;
 use tracing::error;
 
+use super::metrics::NUM_RUNNING_REQUESTS;
+
 /// A thread-safe request counter that tracks the number of requests being processed for each inference service.
 #[derive(Clone, Debug)]
 pub struct RequestCounter {
@@ -35,7 +36,8 @@ impl RequestCounter {
             false
         } else {
             *entry += 1;
-            RUNNING_REQUESTS.record(*entry as u64, &[KeyValue::new("service", key.to_string())]);
+            NUM_RUNNING_REQUESTS
+                .record(*entry as u64, &[KeyValue::new("service", key.to_string())]);
             true
         }
     }
@@ -46,7 +48,7 @@ impl RequestCounter {
             Entry::Occupied(mut entry) => {
                 let count = entry.get_mut();
                 *count -= 1;
-                RUNNING_REQUESTS
+                NUM_RUNNING_REQUESTS
                     .record(*count as u64, &[KeyValue::new("service", key.to_string())]);
                 if *count == 0 {
                     entry.remove();

From 4e9f9697c9a2a97b52af37418abd091dc82d0950 Mon Sep 17 00:00:00 2001
From: Jorge Antonio <matroid@outlook.com>
Date: Fri, 30 May 2025 17:51:40 +0100
Subject: [PATCH 07/13] logs

---
 atoma-service/src/middleware.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/atoma-service/src/middleware.rs b/atoma-service/src/middleware.rs
index 620ffc12..bdc32318 100644
--- a/atoma-service/src/middleware.rs
+++ b/atoma-service/src/middleware.rs
@@ -813,6 +813,13 @@ pub async fn verify_permissions(
         })?;
     if let Some(trigger_time) = state.too_many_requests.get(model) {
         if trigger_time.elapsed().as_millis() < state.too_many_requests_timeout_ms {
+            tracing::info!(
+                target = "atoma-service",
+                level = "info",
+                "Too many requests for model: {model}, endpoint: {endpoint}, elapsed trigger time: {} and timeout: {}",
+                trigger_time.elapsed().as_millis(),
+                state.too_many_requests_timeout_ms
+            );
             return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
                 message: "Too many requests".to_string(),
                 endpoint: endpoint.clone(),

From 03884a49c074500ea4106cc4fff38294f25be935 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Ant=C3=B3nio?= <matroid@outlook.com>
Date: Mon, 2 Jun 2025 09:06:52 +0100
Subject: [PATCH 08/13] handle deadlock for too many requests timeout trigger
 check (#650)

---
 atoma-service/src/middleware.rs | 69 +++++++++++++++++++++++++--------
 1 file changed, 53 insertions(+), 16 deletions(-)

diff --git a/atoma-service/src/middleware.rs b/atoma-service/src/middleware.rs
index bdc32318..04fd20fc 100644
--- a/atoma-service/src/middleware.rs
+++ b/atoma-service/src/middleware.rs
@@ -811,22 +811,7 @@ pub async fn verify_permissions(
             message: "Model is not a string".to_string(),
             endpoint: endpoint.clone(),
         })?;
-    if let Some(trigger_time) = state.too_many_requests.get(model) {
-        if trigger_time.elapsed().as_millis() < state.too_many_requests_timeout_ms {
-            tracing::info!(
-                target = "atoma-service",
-                level = "info",
-                "Too many requests for model: {model}, endpoint: {endpoint}, elapsed trigger time: {} and timeout: {}",
-                trigger_time.elapsed().as_millis(),
-                state.too_many_requests_timeout_ms
-            );
-            return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
-                message: "Too many requests".to_string(),
-                endpoint: endpoint.clone(),
-            });
-        }
-        state.too_many_requests.remove(model);
-    }
+    utils::check_if_too_many_requests(&state, model, &endpoint)?;
     if !state.models.contains(&model.to_string()) {
         return Err(AtomaServiceError::InvalidBody {
             message: format!("Model not supported, supported models: {:?}", state.models),
@@ -1603,4 +1588,56 @@ pub mod utils {
         }
         Ok(())
     }
+
+    /// Checks if the model has too many requests.
+    ///
+    /// This function checks if the model has too many requests by checking if the elapsed time since the first occurrence is less than the timeout.
+    ///
+    /// # Arguments
+    /// * `state` - The application state containing the too many requests map
+    /// * `model` - The model to check
+    /// * `endpoint` - The API endpoint path being accessed (used for error context)
+    ///
+    /// # Returns
+    /// * `Ok(())` - If the model is not currently rate limited (no entry in the map, or the timeout has already elapsed)
+    /// * `Err(AtomaServiceError)` - If the model has too many requests
+    ///
+    /// # Errors
+    /// This function will return an error if:
+    /// - The model has too many requests
+    /// - The elapsed time since the first occurrence is less than the timeout
+    #[instrument(level = "info", skip_all, err)]
+    pub fn check_if_too_many_requests(
+        state: &AppState,
+        model: &str,
+        endpoint: &str,
+    ) -> Result<(), AtomaServiceError> {
+        match state.too_many_requests.entry(model.to_string()) {
+            dashmap::mapref::entry::Entry::Occupied(occupied_entry) => {
+                let elapsed_ms = occupied_entry.get().elapsed().as_millis();
+
+                if elapsed_ms < state.too_many_requests_timeout_ms {
+                    tracing::info!(
+                            target = "atoma-service",
+                            level = "info",
+                            "Too many requests for model: {model}, endpoint: {endpoint}, elapsed trigger time: {elapsed_ms} and timeout: {}",
+                            state.too_many_requests_timeout_ms
+                        );
+                    return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
+                        message: "Too many requests".to_string(),
+                        endpoint: endpoint.to_string(),
+                    });
+                }
+                occupied_entry.remove();
+            }
+            dashmap::mapref::entry::Entry::Vacant(_) => {
+                tracing::debug!(
+                    target = "atoma-service",
+                    level = "debug",
+                    "Model is not in the `too_many_requests` map, so no action is needed here. Processing can continue."
+                );
+            }
+        }
+        Ok(())
+    }
 }

From 6b669a785f0682cc285777a3502d7b85e9d1bdb7 Mon Sep 17 00:00:00 2001
From: Martin Stefcek <35243812+Cifko@users.noreply.github.com>
Date: Mon, 2 Jun 2025 15:38:23 +0200
Subject: [PATCH 09/13] feat: add mem usage (#651)

* feat: add memusage to get_metrics

* add lower threshold for disabling the flag

* fix clippy

* address 2 comments

* add values to config

* fix

* fix tests

* fix name
---
 atoma-bin/atoma_node.rs                       |   4 +-
 atoma-service/src/config.rs                   |   5 +
 .../src/handlers/chat_completions.rs          |  10 +-
 atoma-service/src/handlers/completions.rs     |  10 +-
 atoma-service/src/handlers/mod.rs             | 225 +++++++++++++-----
 atoma-service/src/middleware.rs               |  55 +++--
 atoma-service/src/server.rs                   |  12 +-
 atoma-service/src/tests.rs                    |   4 +-
 config.example.toml                           |   2 +
 9 files changed, 229 insertions(+), 98 deletions(-)

diff --git a/atoma-bin/atoma_node.rs b/atoma-bin/atoma_node.rs
index 6fb87822..16f87281 100644
--- a/atoma-bin/atoma_node.rs
+++ b/atoma-bin/atoma_node.rs
@@ -373,9 +373,11 @@ async fn main() -> Result<()> {
         keystore: Arc::new(keystore),
         address_index,
         whitelist_sui_addresses_for_fiat: config.service.whitelist_sui_addresses_for_fiat,
-        too_many_requests: Arc::new(DashMap::new()),
+        too_many_requests: Arc::new(DashSet::new()),
         too_many_requests_timeout_ms: u128::from(config.service.too_many_requests_timeout_ms),
         running_num_requests: Arc::new(RequestCounter::new()),
+        memory_lower_threshold: config.service.memory_lower_threshold,
+        memory_upper_threshold: config.service.memory_upper_threshold,
     };
 
     let chat_completions_service_urls = app_state
diff --git a/atoma-service/src/config.rs b/atoma-service/src/config.rs
index f7cff89a..e6725ff3 100644
--- a/atoma-service/src/config.rs
+++ b/atoma-service/src/config.rs
@@ -60,6 +60,11 @@ pub struct AtomaServiceConfig {
 
     /// The timeout for the too many requests error in milliseconds.
     pub too_many_requests_timeout_ms: u64,
+
+    /// Lower threshold for memory usage, if the memory usage goes below this value, the service will not be considered overloaded
+    pub memory_lower_threshold: f64,
+    /// Upper threshold for memory usage, if the memory usage goes above this value, the service will be considered overloaded
+    pub memory_upper_threshold: f64,
 }
 
 impl AtomaServiceConfig {
diff --git a/atoma-service/src/handlers/chat_completions.rs b/atoma-service/src/handlers/chat_completions.rs
index 56f4d785..20fa6ae9 100644
--- a/atoma-service/src/handlers/chat_completions.rs
+++ b/atoma-service/src/handlers/chat_completions.rs
@@ -908,6 +908,7 @@ async fn handle_streaming_response(
             &state.running_num_requests,
             chat_completions_service_urls,
             &model.to_lowercase(),
+            state.memory_upper_threshold,
         )
         .await
         .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
@@ -915,9 +916,7 @@ async fn handle_streaming_response(
             endpoint: endpoint.clone(),
         })?;
     if status_code == StatusCode::TOO_MANY_REQUESTS {
-        state
-            .too_many_requests
-            .insert(model.to_string(), Instant::now());
+        state.too_many_requests.insert(model.to_string());
         return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
             message: "Too many requests".to_string(),
             endpoint: endpoint.clone(),
@@ -1341,6 +1340,7 @@ pub mod utils {
                 &state.running_num_requests,
                 chat_completions_service_url_services,
                 model,
+                state.memory_upper_threshold,
             )
             .await
             .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
@@ -1348,9 +1348,7 @@ pub mod utils {
                 endpoint: endpoint.to_string(),
             })?;
         if status_code == StatusCode::TOO_MANY_REQUESTS {
-            state
-                .too_many_requests
-                .insert(model.to_string(), Instant::now());
+            state.too_many_requests.insert(model.to_string());
             return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
                 message: "Too many requests".to_string(),
                 endpoint: endpoint.to_string(),
diff --git a/atoma-service/src/handlers/completions.rs b/atoma-service/src/handlers/completions.rs
index a2da3069..0d80a52f 100644
--- a/atoma-service/src/handlers/completions.rs
+++ b/atoma-service/src/handlers/completions.rs
@@ -881,6 +881,7 @@ async fn handle_streaming_response(
         &state.running_num_requests,
         chat_completions_service_urls,
         model,
+        state.memory_upper_threshold,
     )
     .await
     .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
@@ -888,9 +889,7 @@ async fn handle_streaming_response(
         endpoint: endpoint.clone(),
     })?;
     if status_code == StatusCode::TOO_MANY_REQUESTS {
-        state
-            .too_many_requests
-            .insert(model.to_string(), Instant::now());
+        state.too_many_requests.insert(model.to_string());
         return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
             message: "Too many requests".to_string(),
             endpoint: endpoint.clone(),
@@ -1303,6 +1302,7 @@ pub mod utils {
                 &state.running_num_requests,
                 completions_service_url_services,
                 model,
+                state.memory_upper_threshold,
             )
             .await
             .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
@@ -1310,9 +1310,7 @@ pub mod utils {
                 endpoint: endpoint.to_string(),
             })?;
         if status_code == StatusCode::TOO_MANY_REQUESTS {
-            state
-                .too_many_requests
-                .insert(model.to_string(), Instant::now());
+            state.too_many_requests.insert(model.to_string());
             return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
                 message: "Too many requests".to_string(),
                 endpoint: endpoint.to_string(),
diff --git a/atoma-service/src/handlers/mod.rs b/atoma-service/src/handlers/mod.rs
index 7482dd53..085163dd 100644
--- a/atoma-service/src/handlers/mod.rs
+++ b/atoma-service/src/handlers/mod.rs
@@ -53,9 +53,11 @@ pub const COMPLETION_TOKENS_KEY: &str = "completion_tokens";
 
 const VLLM_RUNNING_REQUESTS_QUERY: &str = "num_requests_running";
 const VLLM_QUEUED_REQUESTS_QUERY: &str = "num_requests_waiting";
+const VLLM_MEMORY_USAGE_QUERY: &str = "gpu_cache_usage_perc";
 const VLLM_SERVICE_PREFIX: &str = "vllm:";
 const SGLANG_RUNNING_REQUESTS_QUERY: &str = "num_running_reqs";
 const SGLANG_QUEUED_REQUESTS_QUERY: &str = "num_queue_reqs";
+const SGLANG_MEMORY_USAGE_QUERY: &str = "token_usage";
 const SGLANG_SERVICE_PREFIX: &str = "sglang:";
 
 #[derive(Debug, Clone)]
@@ -81,6 +83,14 @@ impl InferenceService {
         }
     }
 
+    #[must_use]
+    pub const fn get_usage(&self) -> &'static str {
+        match self {
+            Self::Vllm => VLLM_MEMORY_USAGE_QUERY,
+            Self::SgLang => SGLANG_MEMORY_USAGE_QUERY,
+        }
+    }
+
     #[must_use]
     pub const fn get_service_prefix(&self) -> &'static str {
         match self {
@@ -611,8 +621,8 @@ pub mod inference_service_metrics {
     });
 
     /// Chat completions metrics
-    #[derive(Debug, Clone)]
-    struct ChatCompletionsMetrics {
+    #[derive(Debug, Clone, PartialEq)]
+    pub struct ChatCompletionsMetrics {
         /// The model name  
         model: String,
         /// The chat completions service url
@@ -621,10 +631,47 @@ pub mod inference_service_metrics {
         num_queued_requests: f64,
         /// The number of running requests
         num_running_requests: f64,
+        /// The memory usage as a fraction, e.g. 1.00 means 100% memory usage
+        memory_usage: f64,
         /// The maximum number of running requests allowed for this url.
         max_number_of_running_requests: usize,
     }
 
+    impl Eq for ChatCompletionsMetrics {}
+
+    impl PartialOrd for ChatCompletionsMetrics {
+        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+            Some(self.cmp(other))
+        }
+    }
+
+    impl Ord for ChatCompletionsMetrics {
+        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+            self.num_queued_requests
+                .total_cmp(&other.num_queued_requests)
+                .then_with(|| {
+                    self.memory_usage
+                        .total_cmp(&other.memory_usage)
+                        .then_with(|| {
+                            self.num_running_requests
+                                .total_cmp(&other.num_running_requests)
+                        })
+                })
+        }
+    }
+
+    impl ChatCompletionsMetrics {
+        #[must_use]
+        pub fn above_upper_threshold_exceeded(&self, threshold: f64) -> bool {
+            self.memory_usage > threshold
+        }
+
+        #[must_use]
+        pub fn under_lower_threshold(&self, threshold: f64) -> bool {
+            self.memory_usage <= threshold
+        }
+    }
+
     /// Cache structure to store metrics
     #[derive(Debug, Default)]
     struct MetricsCache {
@@ -787,12 +834,18 @@ pub mod inference_service_metrics {
                         inference_service.get_running_requests_metric_name(),
                         job,
                     )?;
+                    let memory_usage = extract_metric(
+                        &metrics,
+                        inference_service.get_usage(),
+                        job,
+                    )?;
 
                     Ok(ChatCompletionsMetrics {
                         model: model.clone(),
                         chat_completions_service_url: chat_completions_service_url.clone(),
                         num_queued_requests,
                         num_running_requests,
+                        memory_usage,
                         max_number_of_running_requests: *max_number_of_running_requests,
                     })
                 });
@@ -838,71 +891,37 @@ pub mod inference_service_metrics {
             })
     }
 
-    /// Selects the best available chat completions service URL for a given model based on performance metrics.
-    ///
-    /// This function aims to distribute load and ensure optimal response times by choosing
-    /// the service instance that is currently performing best. The selection process prioritizes
-    /// services with lower requests running and queue lengths.
-    ///
-    /// # Metrics and Selection Logic:
+    /// Retrieves all chat completions metrics for the specified model.
     ///
-    /// 1.  **Metrics Source**: Metrics for each service (vLLM or SgLang) are retrieved directly from the inference
-    ///     service URL.
-    ///
-    /// 2.  **Priority of Metrics for "Best" Service Selection**:
-    ///     *   **No Load**: If a service has zero running requests (`num_running_requests` is 0.0),
-    ///         it's considered the best.
-    ///     *   **Number of Queued Requests**: If number of running requests are equivalent, the service
-    ///         with the fewest `num_queue_requests` is selected.
-    ///
-    /// 3.  **Handling Missing or Invalid Metrics**:
-    ///     *   If, after checking all services, no valid metrics are found for the specified `model`,
-    ///         a service URL is chosen randomly from the initial list.
-    ///
-    /// # Load Thresholds and Behavior:
-    ///
-    /// The function defines several thresholds to manage high load scenarios:
-    /// *   `MAX_ALLOWED_NUM_QUEUED_REQUESTS` (1.0)
-    ///
-    /// If the determined "best" service (or all services) exceeds these
-    /// thresholds, the function returns the first URL from the input `chat_completions_service_urls`
-    /// list along with a `StatusCode::TOO_MANY_REQUESTS`. The `CHAT_COMPLETIONS_TOO_MANY_REQUESTS`
-    /// metric counter is also incremented for the model.
+    /// This function fetches metrics from both vLLM and SgLang services,
+    /// partitions the service URLs based on the job type, and retrieves metrics
+    /// for each service. It returns a vector of `ChatCompletionsMetrics` for the specified model.
     ///
     /// # Arguments
     ///
-    /// * `chat_completions_service_urls`: A slice of tuples `(String, String)`, where each tuple
-    ///   represents a service. The first `String` is the service URL, and the second `String`
-    ///   is the job name (e.g., "vllm-service", "sglang-service"), used to determine the
-    ///   metrics querying strategy.
-    /// * `model`: A string slice representing the name of the model for which the best service
-    ///   URL is being requested. The comparison is case-insensitive.
+    /// * `chat_completions_service_urls` - A vector of tuples containing the chat completions service URLs,
+    ///   job names, and maximum concurrent requests.
+    /// * `model` - The model name for which to retrieve metrics.
     ///
     /// # Returns
     ///
-    /// Returns a `Result<(String, StatusCode), ChatCompletionsMetricsError>`:
-    /// *   `Ok((String, StatusCode::OK))`: On success, containing the URL of the best available
-    ///     service and an OK status.
-    /// *   `Ok((String, StatusCode::TOO_MANY_REQUESTS))`: If the system is determined to be under
-    ///     high load based on the metrics thresholds. The returned `String` will be the first URL
-    ///     from the `chat_completions_service_urls` input.
-    /// *   `Err(ChatCompletionsMetricsError)`: If an error occurs, such as no service URLs
-    ///     being provided or issues during metrics fetching that are not handled by fallback mechanisms.
+    /// Returns a `Result<Vec<ChatCompletionsMetrics>>` containing the metrics for the specified model.
     ///
     /// # Errors
     ///
-    /// *   `ChatCompletionsMetricsError::NoChatCompletionsServiceUrlsFound`: If the input
-    ///     `chat_completions_service_urls` slice is empty.
-    /// *   Other variants of `ChatCompletionsMetricsError` may be returned if underlying issues
-    ///     occur during metric collection from Prometheus (e.g., network errors, parsing errors),
-    ///     though the function attempts to handle missing individual metrics gracefully.g
-    #[instrument(level = "info", skip_all, fields(model=model))]
-    #[allow(clippy::float_cmp)]
-    pub async fn get_best_available_chat_completions_service_url(
-        running_num_requests: &RequestCounter,
+    /// *   `ChatCompletionsMetricsError::NoChatCompletionsServiceUrlsFound`: If no chat completions service URLs are provided.
+    /// *   Other variants of `ChatCompletionsMetricsError` may be returned if underlying
+    ///     issues occur during metric collection from Prometheus (e.g., network errors,
+    ///     parsing errors), though the function attempts to handle missing individual metrics gracefully.
+    #[instrument(
+        level = "info",
+        skip(chat_completions_service_urls, model),
+        fields(model = model)
+    )]
+    pub async fn get_all_metrics(
         chat_completions_service_urls: &[(String, String, usize)], // (url, job, max_concurrent_requests)
         model: &str,
-    ) -> Result<(String, StatusCode)> {
+    ) -> Result<Vec<ChatCompletionsMetrics>> {
         type ChatCompletionsServiceUrls = Vec<(String, String, usize)>;
 
         if chat_completions_service_urls.is_empty() {
@@ -1019,6 +1038,7 @@ pub mod inference_service_metrics {
                     chat_completions_service_url,
                     num_queued_requests,
                     num_running_requests,
+                    memory_usage,
                     max_number_of_running_requests,
                 }) => {
                     tracing::info!(
@@ -1045,6 +1065,7 @@ pub mod inference_service_metrics {
                         chat_completions_service_url,
                         num_queued_requests,
                         num_running_requests,
+                        memory_usage,
                         max_number_of_running_requests,
                     });
                 }
@@ -1058,7 +1079,85 @@ pub mod inference_service_metrics {
                 }
             }
         }
+        Ok(metrics_results)
+    }
 
+    /// Selects the best available chat completions service URL for a given model based on performance metrics.
+    ///
+    /// This function aims to distribute load and ensure optimal response times by choosing
+    /// the service instance that is currently performing best. The selection process prioritizes
+    /// services with lower requests running and queue lengths.
+    ///
+    /// # Metrics and Selection Logic:
+    ///
+    /// 1.  **Metrics Source**: Metrics for each service (vLLM or SgLang) are retrieved directly from the inference
+    ///     service URL.
+    ///
+    /// 2.  **Priority of Metrics for "Best" Service Selection**:
+    ///     *   **Number of Queued Requests**: Services are sorted first by `num_queued_requests`,
+    ///         fewest first; ties are broken by lower `memory_usage`, then lower `num_running_requests`.
+    ///     *   **Memory Threshold**: A service whose `memory_usage` is above `memory_upper_threshold`
+    ///         is skipped, and the next service in sorted order is considered.
+    ///
+    /// 3.  **Handling Missing or Invalid Metrics**:
+    ///     *   If, after checking all services, no valid metrics are found for the specified `model`,
+    ///         a service URL is chosen randomly from the initial list.
+    ///
+    /// # Load Thresholds and Behavior:
+    ///
+    /// The function defines several thresholds to manage high load scenarios:
+    /// *   `MAX_ALLOWED_NUM_QUEUED_REQUESTS` (1.0)
+    ///
+    /// If the determined "best" service (or all services) exceeds these
+    /// thresholds, the function returns the first URL from the input `chat_completions_service_urls`
+    /// list along with a `StatusCode::TOO_MANY_REQUESTS`. The `CHAT_COMPLETIONS_TOO_MANY_REQUESTS`
+    /// metric counter is also incremented for the model.
+    ///
+    /// # Arguments
+    ///
+    /// * `chat_completions_service_urls`: A slice of tuples `(String, String, usize)`, where each tuple
+    ///   represents a service. The first `String` is the service URL, the second `String`
+    ///   is the job name (e.g., "vllm-service", "sglang-service"), used to determine the
+    ///   metrics querying strategy, and the `usize` is the maximum number of concurrent requests.
+    /// * `model`: A string slice representing the name of the model for which the best service
+    ///   URL is being requested. The comparison is case-insensitive.
+    ///
+    /// # Returns
+    ///
+    /// Returns a `Result<(String, StatusCode), ChatCompletionsMetricsError>`:
+    /// *   `Ok((String, StatusCode::OK))`: On success, containing the URL of the best available
+    ///     service and an OK status.
+    /// *   `Ok((String, StatusCode::TOO_MANY_REQUESTS))`: If the system is determined to be under
+    ///     high load based on the metrics thresholds. The returned `String` will be the first URL
+    ///     from the `chat_completions_service_urls` input.
+    /// *   `Err(ChatCompletionsMetricsError)`: If an error occurs, such as no service URLs
+    ///     being provided or issues during metrics fetching that are not handled by fallback mechanisms.
+    ///
+    /// # Errors
+    ///
+    /// *   `ChatCompletionsMetricsError::NoChatCompletionsServiceUrlsFound`: If the input
+    ///     `chat_completions_service_urls` slice is empty.
+    /// *   Other variants of `ChatCompletionsMetricsError` may be returned if underlying issues
+    ///     occur during metric collection from Prometheus (e.g., network errors, parsing errors),
+    ///     though the function attempts to handle missing individual metrics gracefully.
+    #[instrument(level = "info", skip_all, fields(model=model))]
+    #[allow(clippy::float_cmp)]
+    pub async fn get_best_available_chat_completions_service_url(
+        running_num_requests: &RequestCounter,
+        chat_completions_service_urls: &[(String, String, usize)], // (url, job, max_concurrent_requests)
+        model: &str,
+        memory_upper_threshold: f64,
+    ) -> Result<(String, StatusCode)> {
+        let mut metrics_results = get_all_metrics(chat_completions_service_urls, model)
+            .await
+            .map_err(|e| {
+                tracing::error!(
+                    target = "atoma-service",
+                    level = "error",
+                    "Failed to get metrics for model: {model} with error: {e}"
+                );
+                e
+            })?;
         if metrics_results.is_empty() {
             tracing::warn!(
                 target = "atoma-service",
@@ -1072,18 +1171,22 @@ pub mod inference_service_metrics {
         }
 
         // Select the best available chat completions service URL based on the number of queued and running requests.
-        metrics_results.sort_by_key(|metric| {
-            (
-                metric.num_queued_requests as i64,
-                metric.num_running_requests as i64,
-            )
-        });
+        metrics_results.sort();
 
         for metric in metrics_results {
             if running_num_requests.increment(
                 &metric.chat_completions_service_url,
                 metric.max_number_of_running_requests,
             ) {
+                if metric.above_upper_threshold_exceeded(memory_upper_threshold) {
+                    tracing::debug!(
+                        target = "atoma-service",
+                        level = "debug",
+                        "Memory usage for model: {model} is too high: {}",
+                        metric.memory_usage
+                    );
+                    continue;
+                }
                 let best_url = metric.chat_completions_service_url.clone();
                 tracing::info!(
                     target = "atoma-service",
diff --git a/atoma-service/src/middleware.rs b/atoma-service/src/middleware.rs
index 04fd20fc..701b986a 100644
--- a/atoma-service/src/middleware.rs
+++ b/atoma-service/src/middleware.rs
@@ -811,7 +811,7 @@ pub async fn verify_permissions(
             message: "Model is not a string".to_string(),
             endpoint: endpoint.clone(),
         })?;
-    utils::check_if_too_many_requests(&state, model, &endpoint)?;
+    utils::check_if_too_many_requests(&state, model, &endpoint).await?;
     if !state.models.contains(&model.to_string()) {
         return Err(AtomaServiceError::InvalidBody {
             message: format!("Model not supported, supported models: {:?}", state.models),
@@ -1016,6 +1016,7 @@ pub mod utils {
         completions::RequestModelCompletions,
         embeddings::RequestModelEmbeddings,
         image_generations::RequestModelImageGenerations,
+        inference_service_metrics::get_all_metrics,
         request_model::{RequestModel, TokensEstimate},
     };
 
@@ -1607,36 +1608,48 @@ pub mod utils {
     /// - The model has too many requests
     /// - The elapsed time since the first occurrence is less than the timeout
     #[instrument(level = "info", skip_all, err)]
-    pub fn check_if_too_many_requests(
+    pub async fn check_if_too_many_requests(
         state: &AppState,
         model: &str,
         endpoint: &str,
     ) -> Result<(), AtomaServiceError> {
-        match state.too_many_requests.entry(model.to_string()) {
-            dashmap::mapref::entry::Entry::Occupied(occupied_entry) => {
-                let elapsed_ms = occupied_entry.get().elapsed().as_millis();
-
-                if elapsed_ms < state.too_many_requests_timeout_ms {
-                    tracing::info!(
-                            target = "atoma-service",
-                            level = "info",
-                            "Too many requests for model: {model}, endpoint: {endpoint}, elapsed trigger time: {elapsed_ms} and timeout: {}",
-                            state.too_many_requests_timeout_ms
-                        );
-                    return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
-                        message: "Too many requests".to_string(),
+        if state.too_many_requests.get(model).is_some() {
+            let chat_completions_service_urls = state
+                .chat_completions_service_urls
+                .get(&model.to_lowercase())
+                .ok_or_else(|| {
+                    AtomaServiceError::InternalError {
+                        message: format!(
+                            "Chat completions service URL not found, likely that model is not supported by the current node: {}",
+                            model
+                        ),
                         endpoint: endpoint.to_string(),
-                    });
-                }
-                occupied_entry.remove();
-            }
-            dashmap::mapref::entry::Entry::Vacant(_) => {
+                    }
+                })?;
+            let metrics = get_all_metrics(chat_completions_service_urls, model)
+                .await
+                .map_err(|e| AtomaServiceError::InternalError {
+                    message: format!("Failed to get metrics for model {model}, with error: {e}"),
+                    endpoint: endpoint.to_string(),
+                })?;
+            if metrics
+                .iter()
+                .any(|metric| metric.under_lower_threshold(state.memory_lower_threshold))
+            {
+                state.too_many_requests.remove(model);
                 tracing::debug!(
                     target = "atoma-service",
                     level = "debug",
-                    "Model is not in the `too_many_requests` map, so no action is needed here. Processing can continue."
+                    "Model {} is in the `too_many_requests` map, but metrics indicate that it is no longer exceeding the lower threshold. Removing from the map.",
+                    model
                 );
             }
+        } else {
+            tracing::debug!(
+                    target = "atoma-service",
+                    level = "debug",
+                    "Model is not in the `too_many_requests` map, so no action is needed here. Processing can continue."
+                );
         }
         Ok(())
     }
diff --git a/atoma-service/src/server.rs b/atoma-service/src/server.rs
index 125a9d9e..7e132116 100644
--- a/atoma-service/src/server.rs
+++ b/atoma-service/src/server.rs
@@ -1,4 +1,4 @@
-use std::{collections::HashMap, sync::Arc, time::Instant};
+use std::{collections::HashMap, sync::Arc};
 
 use atoma_confidential::types::{
     ConfidentialComputeDecryptionRequest, ConfidentialComputeDecryptionResponse,
@@ -207,13 +207,21 @@ pub struct AppState {
     pub whitelist_sui_addresses_for_fiat: Vec<String>,
 
     /// When was the too many requests triggered for each model.
-    pub too_many_requests: Arc<DashMap<String, Instant>>,
+    pub too_many_requests: Arc<DashSet<String>>,
 
     /// The time for which we triiger too many requests since the first occurrence.
     pub too_many_requests_timeout_ms: u128,
 
     /// Number of running requests for each inference service.
     pub running_num_requests: Arc<RequestCounter>,
+
+    /// The upper memory threshold for the node.
+    /// This threshold is used to determine when the node should start rejecting.
+    pub memory_upper_threshold: f64,
+
+    /// The lower memory threshold for the node.
+    /// This threshold is used to determine when the node can start accepting requests again.
+    pub memory_lower_threshold: f64,
 }
 
 /// Creates and configures the main router for the application.
diff --git a/atoma-service/src/tests.rs b/atoma-service/src/tests.rs
index 9410bfca..b601a696 100644
--- a/atoma-service/src/tests.rs
+++ b/atoma-service/src/tests.rs
@@ -341,9 +341,11 @@ mod middleware {
                 address_index: 0,
                 stack_retrieve_sender,
                 whitelist_sui_addresses_for_fiat: vec![],
-                too_many_requests: Arc::new(DashMap::new()),
+                too_many_requests: Arc::new(DashSet::new()),
                 too_many_requests_timeout_ms: 0,
                 running_num_requests: Arc::new(RequestCounter::new()),
+                memory_lower_threshold: 1.0,
+                memory_upper_threshold: 1.0,
             },
             public_key,
             signature,
diff --git a/config.example.toml b/config.example.toml
index 3aa7c708..1a519842 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -46,6 +46,8 @@ image_generations_service_url = "http://image-generations:80"
 # List of models to be used by the service, the current value here is just a placeholder, please change it to the models you want to deploy
 environment                      = "development"                                       # or "production" (for use in sentry, you need to set the Sentry DSN)
 heartbeat_url                    = "my-heartbeat-url"
+memory_lower_threshold           = 0.75                                                # Lower threshold for memory usage, if the memory usage goes below this value, the service will not be considered overloaded
+memory_upper_threshold           = 0.9                                                 # Upper threshold for memory usage, if the memory usage goes above this value, the service will be considered overloaded
 metrics_update_interval          = 30
 models                           = [ "Infermatic/Llama-3.3-70B-Instruct-FP8-Dynamic" ]
 revisions                        = [ "main" ]

From f34694baa83a92a6deb7f07fb90fa1344b46dc19 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Ant=C3=B3nio?= <matroid@outlook.com>
Date: Tue, 3 Jun 2025 17:01:49 +0100
Subject: [PATCH 10/13] feat: update sui dependencies (#654)

* resolve compilation issues

* ci: add caching strategy for ci

* ci: optimize coverage job

* ci: adjust coverage job

* ci: update deny action

* ci: use grcov

* ci: use stable toolchain

* ci: only run tests once

* ci: move coverage to test file

* ci: use --codecov flag & stable toolchain

* ci: discard p2p tester

---------

Co-authored-by: chad <chad.nehemiah94@gmail.com>
---
 .github/workflows/coverage.yml |   26 +-
 .github/workflows/deny.yml     |    3 +-
 Cargo.lock                     | 1191 ++++++++++++++++++++++----------
 Cargo.toml                     |    4 +-
 atoma-bin/atoma_node.rs        |   12 +-
 atoma-sui/src/client.rs        |   12 +-
 6 files changed, 870 insertions(+), 378 deletions(-)

diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index 089ba8f3..0e8d5609 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -1,14 +1,7 @@
 name: Coverage
 
-on: [push, pull_request]
-
-env:
-  toolchain: stable
-  CARGO_HTTP_MULTIPLEXING: false
-  CARGO_TERM_COLOR: always
-  CARGO_UNSTABLE_SPARSE_REGISTRY: true
-  CARGO_INCREMENTAL: 0
-  TERM: unknown
+on:
+  pull_request:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
@@ -17,6 +10,9 @@ concurrency:
 jobs:
   coverage:
     runs-on: ubuntu-22.04
+    env:
+      toolchain: stable
+      CARGO_TERM_COLOR: always
     services:
       postgres:
         image: postgres:13
@@ -33,6 +29,11 @@ jobs:
           --health-retries 5
     steps:
       - uses: actions/checkout@v4
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+        with:
+          toolchain: ${{ env.toolchain }}
+          components: clippy
 
       - name: Install system dependencies
         run: |
@@ -40,13 +41,14 @@ jobs:
           sudo apt-get install -y pkg-config libtss2-dev
 
       - name: Install cargo-llvm-cov
-        run: cargo install cargo-llvm-cov
+        uses: taiki-e/install-action@cargo-llvm-cov
 
       - name: Generate coverage
-        run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
+        run: cargo llvm-cov --all-features --workspace --exclude atoma-p2p-tester --codecov --output-path codecov.json
 
       - name: Upload to Codecov
         uses: codecov/codecov-action@v5
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-          files: lcov.info
+          files: codecov.json
+          fail_ci_if_error: true
diff --git a/.github/workflows/deny.yml b/.github/workflows/deny.yml
index 5c716018..8709a570 100644
--- a/.github/workflows/deny.yml
+++ b/.github/workflows/deny.yml
@@ -1,5 +1,6 @@
 name: CI
-on: [push, pull_request]
+on:
+  pull_request:
 jobs:
   cargo-deny:
     runs-on: ubuntu-22.04
diff --git a/Cargo.lock b/Cargo.lock
index 4e7a5495..59da0862 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,16 @@
 # It is not intended for manual editing.
 version = 4
 
+[[package]]
+name = "Inflector"
+version = "0.11.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3"
+dependencies = [
+ "lazy_static",
+ "regex",
+]
+
 [[package]]
 name = "addchain"
 version = "0.2.0"
@@ -105,6 +115,30 @@ dependencies = [
  "alloc-no-stdlib",
 ]
 
+[[package]]
+name = "allocative"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fac2ce611db8b8cee9b2aa886ca03c924e9da5e5295d0dbd0526e5d0b0710f7"
+dependencies = [
+ "allocative_derive",
+ "bumpalo",
+ "ctor",
+ "hashbrown 0.14.5",
+ "num-bigint 0.4.6",
+]
+
+[[package]]
+name = "allocative_derive"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe233a377643e0fc1a56421d7c90acdec45c291b30345eb9f08e8d0ddce5a4ab"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.101",
+]
+
 [[package]]
 name = "allocator-api2"
 version = "0.2.21"
@@ -179,6 +213,15 @@ dependencies = [
  "uuid",
 ]
 
+[[package]]
+name = "annotate-snippets"
+version = "0.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ccaf7e9dfbb6ab22c82e473cd1a8a7bd313c19a5b7e40970f3d89ef5a5c9e81e"
+dependencies = [
+ "unicode-width 0.1.14",
+]
+
 [[package]]
 name = "anstream"
 version = "0.6.18"
@@ -220,12 +263,12 @@ dependencies = [
 
 [[package]]
 name = "anstyle-wincon"
-version = "3.0.7"
+version = "3.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e"
+checksum = "6680de5231bd6ee4c6191b8a1325daa282b415391ec9d3a37bd34f2060dc73fa"
 dependencies = [
  "anstyle",
- "once_cell",
+ "once_cell_polyfill",
  "windows-sys 0.59.0",
 ]
 
@@ -263,15 +306,6 @@ dependencies = [
  "derive_arbitrary",
 ]
 
-[[package]]
-name = "arc-swap"
-version = "1.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457"
-dependencies = [
- "serde",
-]
-
 [[package]]
 name = "ark-bn254"
 version = "0.4.0"
@@ -474,6 +508,15 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
+[[package]]
+name = "ascii-canvas"
+version = "3.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8824ecca2e851cec16968d54a01dd372ef8f95b244fb84b84e70128be347c3c6"
+dependencies = [
+ "term",
+]
+
 [[package]]
 name = "asn1-rs"
 version = "0.6.2"
@@ -577,9 +620,9 @@ dependencies = [
 
 [[package]]
 name = "async-io"
-version = "2.4.0"
+version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43a2b323ccce0a1d90b449fd71f2a06ca7faa7c54c2751f06c9bd851fc061059"
+checksum = "1237c0ae75a0f3765f58910ff9cdd0a12eeb39ab2f4c7de23262f337f0aacbb3"
 dependencies = [
  "async-lock",
  "cfg-if",
@@ -588,7 +631,7 @@ dependencies = [
  "futures-lite",
  "parking",
  "polling",
- "rustix 0.38.44",
+ "rustix",
  "slab",
  "tracing",
  "windows-sys 0.59.0",
@@ -824,7 +867,7 @@ dependencies = [
  "tokenizers",
  "tokio",
  "tower 0.5.2",
- "tower-http 0.6.4",
+ "tower-http 0.6.5",
  "tracing",
  "tracing-appender",
  "tracing-opentelemetry",
@@ -1041,29 +1084,6 @@ dependencies = [
  "syn 2.0.101",
 ]
 
-[[package]]
-name = "axum-server"
-version = "0.6.1"
-source = "git+https://github.com/bmwill/axum-server.git?rev=f44323e271afdd1365fd0c8b0a4c0bbdf4956cb7#f44323e271afdd1365fd0c8b0a4c0bbdf4956cb7"
-dependencies = [
- "arc-swap",
- "bytes",
- "futures-util",
- "http 1.3.1",
- "http-body",
- "http-body-util",
- "hyper",
- "hyper-util",
- "pin-project-lite",
- "rustls",
- "rustls-pemfile",
- "rustls-pki-types",
- "tokio",
- "tokio-rustls",
- "tower 0.4.13",
- "tower-service",
-]
-
 [[package]]
 name = "backtrace"
 version = "0.3.75"
@@ -1137,6 +1157,12 @@ version = "0.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d86b93f97252c47b41663388e6d155714a9d0c398b99f1005cbc5f978b29f445"
 
+[[package]]
+name = "beef"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1"
+
 [[package]]
 name = "bellpepper"
 version = "0.4.1"
@@ -1208,15 +1234,30 @@ dependencies = [
  "zeroize",
 ]
 
+[[package]]
+name = "bit-set"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1"
+dependencies = [
+ "bit-vec 0.6.3",
+]
+
 [[package]]
 name = "bit-set"
 version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
 dependencies = [
- "bit-vec",
+ "bit-vec 0.8.0",
 ]
 
+[[package]]
+name = "bit-vec"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
+
 [[package]]
 name = "bit-vec"
 version = "0.8.0"
@@ -1246,9 +1287,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
 [[package]]
 name = "bitflags"
-version = "2.9.0"
+version = "2.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd"
+checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967"
 dependencies = [
  "serde",
 ]
@@ -1480,9 +1521,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.2.22"
+version = "1.2.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32db95edf998450acc7881c932f94cd9b05c87b4b2599e8bab064753da4acfd1"
+checksum = "d0fc897dc1e865cc67c0e05a836d9d3f1df3cbe442aa4a9473b18e12624a4951"
 dependencies = [
  "jobserver",
  "libc",
@@ -1501,6 +1542,12 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
+[[package]]
+name = "cfg_aliases"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
+
 [[package]]
 name = "cfg_aliases"
 version = "0.2.1"
@@ -1586,9 +1633,9 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.5.38"
+version = "4.5.39"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed93b9805f8ba930df42c2590f05453d5ec36cbb85d018868a5b24d31f6ac000"
+checksum = "fd60e63e9be68e5fb56422e397cf9baddded06dae1d2e523401542383bc72a9f"
 dependencies = [
  "clap_builder",
  "clap_derive",
@@ -1596,9 +1643,9 @@ dependencies = [
 
 [[package]]
 name = "clap_builder"
-version = "4.5.38"
+version = "4.5.39"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "379026ff283facf611b0ea629334361c4211d1b12ee01024eec1591133b04120"
+checksum = "89cc6392a1f72bbeb820d71f32108f61fdaf18bc526e1d23954168a67759ef51"
 dependencies = [
  "anstream",
  "anstyle",
@@ -1625,6 +1672,21 @@ version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
 
+[[package]]
+name = "clipboard-win"
+version = "5.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15efe7a882b08f34e38556b14f2fb3daa98769d06c7f0c1b076dfd0d983bc892"
+dependencies = [
+ "error-code",
+]
+
+[[package]]
+name = "cmp_any"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e9b18233253483ce2f65329a24072ec414db782531bdbb7d0bbc4bd2ce6b7e21"
+
 [[package]]
 name = "codespan"
 version = "0.11.1"
@@ -1703,7 +1765,7 @@ dependencies = [
 [[package]]
 name = "consensus-config"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "fastcrypto 0.1.8",
  "mysten-network",
@@ -1784,9 +1846,9 @@ dependencies = [
 
 [[package]]
 name = "core-foundation"
-version = "0.10.0"
+version = "0.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b55271e5c8c478ad3f38ad24ef34923091e0548492a266d19b3c0b4d82574c63"
+checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6"
 dependencies = [
  "core-foundation-sys",
  "libc",
@@ -1955,6 +2017,16 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "ctor"
+version = "0.1.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096"
+dependencies = [
+ "quote",
+ "syn 1.0.109",
+]
+
 [[package]]
 name = "ctr"
 version = "0.9.2"
@@ -2137,6 +2209,17 @@ dependencies = [
  "uuid",
 ]
 
+[[package]]
+name = "debugserver-types"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2bf6834a70ed14e8e4e41882df27190bea150f1f6ecf461f1033f8739cd8af4a"
+dependencies = [
+ "schemafy",
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "der"
 version = "0.6.1"
@@ -2289,12 +2372,19 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22"
 dependencies = [
+ "convert_case 0.6.0",
  "proc-macro2",
  "quote",
  "syn 2.0.101",
  "unicode-xid",
 ]
 
+[[package]]
+name = "diff"
+version = "0.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
+
 [[package]]
 name = "digest"
 version = "0.9.0"
@@ -2378,6 +2468,16 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "display_container"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0a110a75c96bedec8e65823dea00a1d710288b7a369d95fd8a0f5127639466fa"
+dependencies = [
+ "either",
+ "indenter",
+]
+
 [[package]]
 name = "displaydoc"
 version = "0.2.5"
@@ -2416,6 +2516,26 @@ version = "1.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
 
+[[package]]
+name = "dupe"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ed2bc011db9c93fbc2b6cdb341a53737a55bafb46dbb74cf6764fc33a2fbf9c"
+dependencies = [
+ "dupe_derive",
+]
+
+[[package]]
+name = "dupe_derive"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83e195b4945e88836d826124af44fdcb262ec01ef94d44f14f4fb5103f19892a"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.101",
+]
+
 [[package]]
 name = "dyn-clone"
 version = "1.0.19"
@@ -2537,6 +2657,15 @@ dependencies = [
  "zeroize",
 ]
 
+[[package]]
+name = "ena"
+version = "0.14.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d248bdd43ce613d87415282f69b9bb99d947d290b10962dd6c56233312c2ad5"
+dependencies = [
+ "log",
+]
+
 [[package]]
 name = "encode_unicode"
 version = "1.0.0"
@@ -2552,6 +2681,12 @@ dependencies = [
  "cfg-if",
 ]
 
+[[package]]
+name = "endian-type"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
+
 [[package]]
 name = "enum-as-inner"
 version = "0.6.1"
@@ -2567,7 +2702,7 @@ dependencies = [
 [[package]]
 name = "enum-compat-util"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "serde_yaml 0.8.26",
 ]
@@ -2599,16 +2734,31 @@ dependencies = [
  "typeid",
 ]
 
+[[package]]
+name = "erased-serde"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c138974f9d5e7fe373eb04df7cae98833802ae4b11c24ac7039a21d5af4b26c"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "errno"
-version = "0.3.11"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "976dd42dc7e85965fe702eb8164f21f450704bdde31faefd6471dba214cb594e"
+checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18"
 dependencies = [
  "libc",
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "error-code"
+version = "3.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dea2df4cf52843e0452895c455a1a2cfbb842a1e7329671acf418fdc53ed4c59"
+
 [[package]]
 name = "esaxx-rs"
 version = "0.1.10"
@@ -2631,9 +2781,9 @@ dependencies = [
 
 [[package]]
 name = "ethnum"
-version = "1.5.1"
+version = "1.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0939f82868b77ef93ce3c3c3daf2b3c526b456741da5a1a4559e590965b6026b"
+checksum = "ca81e6b4777c89fd810c25a4be2b1bd93ea034fbe58e6a75216a34c6b82c539b"
 
 [[package]]
 name = "event-listener"
@@ -2848,6 +2998,17 @@ version = "2.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
 
+[[package]]
+name = "fd-lock"
+version = "4.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78"
+dependencies = [
+ "cfg-if",
+ "rustix",
+ "windows-sys 0.59.0",
+]
+
 [[package]]
 name = "ff"
 version = "0.12.1"
@@ -2922,6 +3083,12 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d"
 
+[[package]]
+name = "fixedbitset"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"
+
 [[package]]
 name = "flate2"
 version = "1.1.1"
@@ -3131,17 +3298,27 @@ dependencies = [
  "slab",
 ]
 
+[[package]]
+name = "fxhash"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
+dependencies = [
+ "byteorder",
+]
+
 [[package]]
 name = "generator"
-version = "0.8.4"
+version = "0.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc6bd114ceda131d3b1d665eba35788690ad37f5916457286b32ab6fd3c438dd"
+checksum = "d18470a76cb7f8ff746cf1f7470914f900252ec36bbc40b569d74b1258446827"
 dependencies = [
+ "cc",
  "cfg-if",
  "libc",
  "log",
  "rustversion",
- "windows 0.58.0",
+ "windows 0.61.1",
 ]
 
 [[package]]
@@ -3365,15 +3542,9 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
 [[package]]
 name = "hermit-abi"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
-
-[[package]]
-name = "hermit-abi"
-version = "0.4.0"
+version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc"
+checksum = "f154ce46856750ed433c8649605bf7ed2de3bc35fd9d2a9f30cddd873c80cb08"
 
 [[package]]
 name = "hex"
@@ -3596,8 +3767,8 @@ dependencies = [
 
 [[package]]
 name = "hyper-rustls"
-version = "0.27.5"
-source = "git+https://github.com/rustls/hyper-rustls?branch=main#8f9728a8cb9b5c52b0ef23169dd968e98b8ef42a"
+version = "0.27.6"
+source = "git+https://github.com/rustls/hyper-rustls?branch=main#e6a23710aa02b81ccf03d54801df8faace53eb68"
 dependencies = [
  "http 1.3.1",
  "hyper",
@@ -3609,7 +3780,7 @@ dependencies = [
  "tokio",
  "tokio-rustls",
  "tower-service",
- "webpki-roots 0.26.11",
+ "webpki-roots 1.0.0",
 ]
 
 [[package]]
@@ -3643,22 +3814,28 @@ dependencies = [
 
 [[package]]
 name = "hyper-util"
-version = "0.1.11"
+version = "0.1.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "497bbc33a26fdd4af9ed9c70d63f61cf56a938375fbb32df34db9b1cd6d643f2"
+checksum = "b1c293b6b3d21eca78250dc7dbebd6b9210ec5530e038cbfe0661b5c47ab06e8"
 dependencies = [
+ "base64 0.22.1",
  "bytes",
  "futures-channel",
+ "futures-core",
  "futures-util",
  "http 1.3.1",
  "http-body",
  "hyper",
+ "ipnet",
  "libc",
+ "percent-encoding",
  "pin-project-lite",
  "socket2",
+ "system-configuration",
  "tokio",
  "tower-service",
  "tracing",
+ "windows-registry",
 ]
 
 [[package]]
@@ -3673,7 +3850,7 @@ dependencies = [
  "js-sys",
  "log",
  "wasm-bindgen",
- "windows-core 0.61.0",
+ "windows-core 0.61.2",
 ]
 
 [[package]]
@@ -3734,9 +3911,9 @@ checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3"
 
 [[package]]
 name = "icu_properties"
-version = "2.0.0"
+version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2549ca8c7241c82f59c80ba2a6f415d931c5b58d24fb8412caa1a1f02c49139a"
+checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b"
 dependencies = [
  "displaydoc",
  "icu_collections",
@@ -3750,9 +3927,9 @@ dependencies = [
 
 [[package]]
 name = "icu_properties_data"
-version = "2.0.0"
+version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8197e866e47b68f8f7d95249e172903bec06004b18b2937f1095d40a0c57de04"
+checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632"
 
 [[package]]
 name = "icu_provider"
@@ -3964,6 +4141,15 @@ dependencies = [
  "similar",
 ]
 
+[[package]]
+name = "inventory"
+version = "0.3.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab08d7cd2c5897f2c949e5383ea7c7db03fb19130ffcfbf7eda795137ae3cb83"
+dependencies = [
+ "rustversion",
+]
+
 [[package]]
 name = "ipconfig"
 version = "0.3.2"
@@ -3992,6 +4178,17 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "is-terminal"
+version = "0.4.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys 0.59.0",
+]
+
 [[package]]
 name = "is_terminal_polyfill"
 version = "1.70.1"
@@ -4313,6 +4510,37 @@ dependencies = [
  "cpufeatures",
 ]
 
+[[package]]
+name = "lalrpop"
+version = "0.19.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0a1cbf952127589f2851ab2046af368fd20645491bb4b376f04b7f94d7a9837b"
+dependencies = [
+ "ascii-canvas",
+ "bit-set 0.5.3",
+ "diff",
+ "ena",
+ "is-terminal",
+ "itertools 0.10.5",
+ "lalrpop-util",
+ "petgraph 0.6.5",
+ "regex",
+ "regex-syntax 0.6.29",
+ "string_cache",
+ "term",
+ "tiny-keccak",
+ "unicode-xid",
+]
+
+[[package]]
+name = "lalrpop-util"
+version = "0.19.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3c48237b9604c5a4702de6b824e02006c3214327564636aef27c1028a8fa0ed"
+dependencies = [
+ "regex",
+]
+
 [[package]]
 name = "lazy_static"
 version = "1.5.0"
@@ -4322,6 +4550,15 @@ dependencies = [
  "spin",
 ]
 
+[[package]]
+name = "lcov"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ccfa6d5e585a884db65b37f38184e4364eaf74d884ac35d0a90fe9baf80b723"
+dependencies = [
+ "thiserror 1.0.69",
+]
+
 [[package]]
 name = "leb128"
 version = "0.2.5"
@@ -4336,9 +4573,9 @@ checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
 
 [[package]]
 name = "libloading"
-version = "0.8.7"
+version = "0.8.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a793df0d7afeac54f95b471d3af7f0d4fb975699f972341a4b76988d49cdf0c"
+checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
 dependencies = [
  "cfg-if",
  "windows-targets 0.53.0",
@@ -4769,7 +5006,7 @@ dependencies = [
  "thiserror 2.0.12",
  "tracing",
  "yamux 0.12.1",
- "yamux 0.13.4",
+ "yamux 0.13.5",
 ]
 
 [[package]]
@@ -4778,7 +5015,7 @@ version = "0.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d"
 dependencies = [
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
  "libc",
 ]
 
@@ -4809,30 +5046,24 @@ checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
 
 [[package]]
 name = "linkme"
-version = "0.3.32"
+version = "0.3.33"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22d227772b5999ddc0690e733f734f95ca05387e329c4084fe65678c51198ffe"
+checksum = "a1b1703c00b2a6a70738920544aa51652532cacddfec2e162d2e29eae01e665c"
 dependencies = [
  "linkme-impl",
 ]
 
 [[package]]
 name = "linkme-impl"
-version = "0.3.32"
+version = "0.3.33"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "71a98813fa0073a317ed6a8055dcd4722a49d9b862af828ee68449adb799b6be"
+checksum = "04d55ca5d5a14363da83bf3c33874b8feaa34653e760d5216d7ef9829c88001a"
 dependencies = [
  "proc-macro2",
  "quote",
  "syn 2.0.101",
 ]
 
-[[package]]
-name = "linux-raw-sys"
-version = "0.4.15"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
-
 [[package]]
 name = "linux-raw-sys"
 version = "0.9.4"
@@ -4847,9 +5078,9 @@ checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956"
 
 [[package]]
 name = "lock_api"
-version = "0.4.12"
+version = "0.4.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
+checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765"
 dependencies = [
  "autocfg",
  "scopeguard",
@@ -4865,13 +5096,36 @@ dependencies = [
 ]
 
 [[package]]
-name = "loki-api"
-version = "0.1.3"
+name = "logos"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bdc38a304f59a03e6efa3876766a48c70a766a93f88341c3fff4212834b8e327"
+checksum = "bf8b031682c67a8e3d5446840f9573eb7fe26efe7ec8d195c9ac4c0647c502f1"
 dependencies = [
- "prost",
- "prost-types",
+ "logos-derive",
+]
+
+[[package]]
+name = "logos-derive"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1d849148dbaf9661a6151d1ca82b13bb4c4c128146a88d05253b38d4e2f496c"
+dependencies = [
+ "beef",
+ "fnv",
+ "proc-macro2",
+ "quote",
+ "regex-syntax 0.6.29",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "loki-api"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bdc38a304f59a03e6efa3876766a48c70a766a93f88341c3fff4212834b8e327"
+dependencies = [
+ "prost",
+ "prost-types",
 ]
 
 [[package]]
@@ -4911,6 +5165,19 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
 
+[[package]]
+name = "lsp-types"
+version = "0.94.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c66bfd44a06ae10647fe3f8214762e9369fd4248df1350924b4ef9e770a85ea1"
+dependencies = [
+ "bitflags 1.3.2",
+ "serde",
+ "serde_json",
+ "serde_repr",
+ "url",
+]
+
 [[package]]
 name = "lsp-types"
 version = "0.95.1"
@@ -4926,9 +5193,9 @@ dependencies = [
 
 [[package]]
 name = "macro_rules_attribute"
-version = "0.2.0"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a82271f7bc033d84bbca59a3ce3e4159938cb08a9c3aebbe54d215131518a13"
+checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520"
 dependencies = [
  "macro_rules_attribute-proc_macro",
  "paste",
@@ -4936,9 +5203,15 @@ dependencies = [
 
 [[package]]
 name = "macro_rules_attribute-proc_macro"
-version = "0.2.0"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30"
+
+[[package]]
+name = "maplit"
+version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
+checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
 
 [[package]]
 name = "matchers"
@@ -4983,6 +5256,15 @@ version = "2.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 
+[[package]]
+name = "memoffset"
+version = "0.6.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce"
+dependencies = [
+ "autocfg",
+]
+
 [[package]]
 name = "mime"
 version = "0.3.17"
@@ -5016,13 +5298,13 @@ dependencies = [
 
 [[package]]
 name = "mio"
-version = "1.0.3"
+version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd"
+checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c"
 dependencies = [
  "libc",
  "wasi 0.11.0+wasi-snapshot-preview1",
- "windows-sys 0.52.0",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -5068,7 +5350,7 @@ dependencies = [
 [[package]]
 name = "move-abstract-interpreter"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "move-binary-format",
  "move-bytecode-verifier-meter",
@@ -5077,12 +5359,12 @@ dependencies = [
 [[package]]
 name = "move-abstract-stack"
 version = "0.0.1"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 
 [[package]]
 name = "move-binary-format"
 version = "0.0.3"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "enum-compat-util",
@@ -5097,12 +5379,12 @@ dependencies = [
 [[package]]
 name = "move-borrow-graph"
 version = "0.0.1"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 
 [[package]]
 name = "move-bytecode-source-map"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "bcs",
@@ -5118,20 +5400,20 @@ dependencies = [
 [[package]]
 name = "move-bytecode-utils"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "indexmap 2.9.0",
  "move-binary-format",
  "move-core-types",
- "petgraph",
+ "petgraph 0.5.1",
  "serde-reflection",
 ]
 
 [[package]]
 name = "move-bytecode-verifier"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "move-abstract-interpreter",
  "move-abstract-stack",
@@ -5140,13 +5422,13 @@ dependencies = [
  "move-bytecode-verifier-meter",
  "move-core-types",
  "move-vm-config",
- "petgraph",
+ "petgraph 0.5.1",
 ]
 
 [[package]]
 name = "move-bytecode-verifier-meter"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "move-binary-format",
  "move-core-types",
@@ -5156,7 +5438,7 @@ dependencies = [
 [[package]]
 name = "move-command-line-common"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "bcs",
@@ -5177,7 +5459,7 @@ dependencies = [
 [[package]]
 name = "move-compiler"
 version = "0.0.1"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "bcs",
@@ -5186,7 +5468,7 @@ dependencies = [
  "dunce",
  "hex",
  "insta",
- "lsp-types",
+ "lsp-types 0.95.1",
  "move-binary-format",
  "move-borrow-graph",
  "move-bytecode-source-map",
@@ -5198,7 +5480,7 @@ dependencies = [
  "move-proc-macros",
  "move-symbol-pool",
  "once_cell",
- "petgraph",
+ "petgraph 0.5.1",
  "rayon",
  "regex",
  "serde",
@@ -5212,7 +5494,7 @@ dependencies = [
 [[package]]
 name = "move-core-types"
 version = "0.0.4"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "bcs",
@@ -5236,7 +5518,7 @@ dependencies = [
 [[package]]
 name = "move-coverage"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "bcs",
@@ -5244,20 +5526,23 @@ dependencies = [
  "codespan",
  "colored",
  "indexmap 2.9.0",
+ "lcov",
  "move-abstract-interpreter",
  "move-binary-format",
  "move-bytecode-source-map",
  "move-command-line-common",
+ "move-compiler",
  "move-core-types",
  "move-ir-types",
- "petgraph",
+ "move-trace-format",
+ "petgraph 0.5.1",
  "serde",
 ]
 
 [[package]]
 name = "move-disassembler"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "bcs",
@@ -5278,7 +5563,7 @@ dependencies = [
 [[package]]
 name = "move-ir-to-bytecode"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "codespan-reporting",
@@ -5296,7 +5581,7 @@ dependencies = [
 [[package]]
 name = "move-ir-to-bytecode-syntax"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "hex",
@@ -5309,7 +5594,7 @@ dependencies = [
 [[package]]
 name = "move-ir-types"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "hex",
  "move-command-line-common",
@@ -5322,7 +5607,7 @@ dependencies = [
 [[package]]
 name = "move-proc-macros"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "enum-compat-util",
  "quote",
@@ -5332,17 +5617,29 @@ dependencies = [
 [[package]]
 name = "move-symbol-pool"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "once_cell",
  "phf",
  "serde",
 ]
 
+[[package]]
+name = "move-trace-format"
+version = "0.0.1"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
+dependencies = [
+ "move-binary-format",
+ "move-core-types",
+ "serde",
+ "serde_json",
+ "zstd",
+]
+
 [[package]]
 name = "move-vm-config"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "move-binary-format",
  "once_cell",
@@ -5351,7 +5648,7 @@ dependencies = [
 [[package]]
 name = "move-vm-profiler"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "move-vm-config",
  "once_cell",
@@ -5363,7 +5660,7 @@ dependencies = [
 [[package]]
 name = "move-vm-test-utils"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "move-binary-format",
@@ -5377,7 +5674,7 @@ dependencies = [
 [[package]]
 name = "move-vm-types"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "bcs",
  "move-binary-format",
@@ -5499,7 +5796,7 @@ dependencies = [
 [[package]]
 name = "mysten-common"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "antithesis_sdk",
  "anyhow",
@@ -5509,13 +5806,10 @@ dependencies = [
  "mysten-metrics",
  "once_cell",
  "parking_lot",
- "prometheus",
  "rand 0.8.5",
  "reqwest",
  "snap",
  "sui-macros",
- "sui-tls",
- "sui-types",
  "tempfile",
  "tokio",
  "tracing",
@@ -5524,7 +5818,7 @@ dependencies = [
 [[package]]
 name = "mysten-metrics"
 version = "0.7.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "async-trait",
  "axum 0.8.4",
@@ -5545,7 +5839,7 @@ dependencies = [
 [[package]]
 name = "mysten-network"
 version = "0.2.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anemo",
  "anemo-tower",
@@ -5684,6 +5978,21 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "new_debug_unreachable"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
+
+[[package]]
+name = "nibble_vec"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43"
+dependencies = [
+ "smallvec",
+]
+
 [[package]]
 name = "nix"
 version = "0.26.4"
@@ -5695,6 +6004,18 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "nix"
+version = "0.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4"
+dependencies = [
+ "bitflags 2.9.1",
+ "cfg-if",
+ "cfg_aliases 0.1.1",
+ "libc",
+]
+
 [[package]]
 name = "no-std-compat"
 version = "0.4.1"
@@ -5867,11 +6188,11 @@ dependencies = [
 
 [[package]]
 name = "num_cpus"
-version = "1.16.0"
+version = "1.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
+checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
 dependencies = [
- "hermit-abi 0.3.9",
+ "hermit-abi",
  "libc",
 ]
 
@@ -5916,7 +6237,7 @@ name = "nvml-wrapper"
 version = "0.10.0"
 source = "git+https://github.com/atoma-network/nvml-wrapper.git?branch=main#0d416436404473bc11795dacc1c0c5a995d9aa09"
 dependencies = [
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
  "libloading",
  "nvml-wrapper-sys",
  "static_assertions",
@@ -5997,13 +6318,19 @@ version = "1.21.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
 
+[[package]]
+name = "once_cell_polyfill"
+version = "1.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad"
+
 [[package]]
 name = "onig"
-version = "6.4.0"
+version = "6.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f"
+checksum = "336b9c63443aceef14bea841b899035ae3abe89b7c486aaf4c5bd8aafedac3f0"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.9.1",
  "libc",
  "once_cell",
  "onig_sys",
@@ -6011,9 +6338,9 @@ dependencies = [
 
 [[package]]
 name = "onig_sys"
-version = "69.8.1"
+version = "69.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7"
+checksum = "c7f86c6eef3d6df15f23bcfb6af487cbd2fed4e5581d58d5bf1f5f8b7f6727dc"
 dependencies = [
  "cc",
  "pkg-config",
@@ -6027,11 +6354,11 @@ checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"
 
 [[package]]
 name = "openssl"
-version = "0.10.72"
+version = "0.10.73"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fedfea7d58a1f73118430a55da6a286e7b044961736ce96a16a17068ea25e5da"
+checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8"
 dependencies = [
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
  "cfg-if",
  "foreign-types",
  "libc",
@@ -6059,9 +6386,9 @@ checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e"
 
 [[package]]
 name = "openssl-sys"
-version = "0.9.108"
+version = "0.9.109"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e145e1651e858e820e4860f7b9c5e169bc1d8ce1c86043be79fa7b7634821847"
+checksum = "90096e2e47630d78b7d1c20952dc621f957103f8bc2c8359ec81290d75238571"
 dependencies = [
  "cc",
  "libc",
@@ -6292,9 +6619,9 @@ checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba"
 
 [[package]]
 name = "parking_lot"
-version = "0.12.3"
+version = "0.12.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
+checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13"
 dependencies = [
  "lock_api",
  "parking_lot_core",
@@ -6302,9 +6629,9 @@ dependencies = [
 
 [[package]]
 name = "parking_lot_core"
-version = "0.9.10"
+version = "0.9.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
+checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5"
 dependencies = [
  "cfg-if",
  "libc",
@@ -6319,7 +6646,7 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "77144664f6aac5f629d7efa815f5098a054beeeca6ccafee5ec453fd2b0c53f9"
 dependencies = [
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
  "ciborium",
  "coset",
  "data-encoding",
@@ -6458,10 +6785,20 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "467d164a6de56270bd7c4d070df81d07beace25012d5103ced4e9ff08d6afdb7"
 dependencies = [
- "fixedbitset",
+ "fixedbitset 0.2.0",
  "indexmap 1.9.3",
 ]
 
+[[package]]
+name = "petgraph"
+version = "0.6.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db"
+dependencies = [
+ "fixedbitset 0.4.2",
+ "indexmap 2.9.0",
+]
+
 [[package]]
 name = "phf"
 version = "0.11.3"
@@ -6587,15 +6924,15 @@ checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
 
 [[package]]
 name = "polling"
-version = "3.7.4"
+version = "3.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a604568c3202727d1507653cb121dbd627a58684eb09a820fd746bee38b4442f"
+checksum = "b53a684391ad002dd6a596ceb6c74fd004fdce75f4be2e3f615068abbea5fd50"
 dependencies = [
  "cfg-if",
  "concurrent-queue",
- "hermit-abi 0.4.0",
+ "hermit-abi",
  "pin-project-lite",
- "rustix 0.38.44",
+ "rustix",
  "tracing",
  "windows-sys 0.59.0",
 ]
@@ -6653,6 +6990,12 @@ dependencies = [
  "zerocopy",
 ]
 
+[[package]]
+name = "precomputed-hash"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
+
 [[package]]
 name = "primeorder"
 version = "0.13.6"
@@ -6789,7 +7132,7 @@ dependencies = [
 [[package]]
 name = "prometheus-closure-metric"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "prometheus",
@@ -6828,9 +7171,9 @@ version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "14cae93065090804185d3b75f0bf93b8eeda30c7a9b4a33d3bdb3988d6229e50"
 dependencies = [
- "bit-set",
- "bit-vec",
- "bitflags 2.9.0",
+ "bit-set 0.8.0",
+ "bit-vec 0.8.0",
+ "bitflags 2.9.1",
  "lazy_static",
  "num-traits",
  "rand 0.8.5",
@@ -6963,7 +7306,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "626214629cda6781b6dc1d316ba307189c85ba657213ce642d9c77670f8202c8"
 dependencies = [
  "bytes",
- "cfg_aliases",
+ "cfg_aliases 0.2.1",
  "futures-io",
  "pin-project-lite",
  "quinn-proto",
@@ -7004,7 +7347,7 @@ version = "0.5.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ee4e529991f949c5e25755532370b8af5d114acae52326361d68d47af64aa842"
 dependencies = [
- "cfg_aliases",
+ "cfg_aliases 0.2.1",
  "libc",
  "once_cell",
  "socket2",
@@ -7039,6 +7382,16 @@ version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
 
+[[package]]
+name = "radix_trie"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd"
+dependencies = [
+ "endian-type",
+ "nibble_vec",
+]
+
 [[package]]
 name = "rand"
 version = "0.8.5"
@@ -7122,7 +7475,7 @@ version = "11.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c6df7ab838ed27997ba19a4664507e6f82b41fe6e20be42929332156e5e85146"
 dependencies = [
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
 ]
 
 [[package]]
@@ -7186,7 +7539,7 @@ version = "0.5.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "928fca9cf2aa042393a8325b9ead81d2f0df4cb12e1e24cef072922ccd99c5af"
 dependencies = [
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
 ]
 
 [[package]]
@@ -7287,9 +7640,9 @@ dependencies = [
 
 [[package]]
 name = "reqwest"
-version = "0.12.15"
+version = "0.12.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d19c46a6fdd48bc4dab94b6103fccc55d34c67cc0ad04653aad4ea2a07cd7bbb"
+checksum = "a2f8e5513d63f2e5b386eb5106dc67eaf3f84e95258e210489136b8b92ad6119"
 dependencies = [
  "base64 0.22.1",
  "bytes",
@@ -7316,33 +7669,31 @@ dependencies = [
  "quinn",
  "rustls",
  "rustls-native-certs",
- "rustls-pemfile",
  "rustls-pki-types",
  "serde",
  "serde_json",
  "serde_urlencoded",
  "sync_wrapper",
- "system-configuration",
  "tokio",
  "tokio-native-tls",
  "tokio-rustls",
  "tokio-util",
  "tower 0.5.2",
+ "tower-http 0.6.5",
  "tower-service",
  "url",
  "wasm-bindgen",
  "wasm-bindgen-futures",
  "wasm-streams",
  "web-sys",
- "webpki-roots 0.26.11",
- "windows-registry",
+ "webpki-roots 1.0.0",
 ]
 
 [[package]]
 name = "resolv-conf"
-version = "0.7.3"
+version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc7c8f7f733062b66dc1c63f9db168ac0b97a9210e247fa90fdc9ad08f51b302"
+checksum = "95325155c684b1c89f7765e30bc1c42e4a6da51ca513615660cb8a62ef9a88e3"
 
 [[package]]
 name = "rfc6979"
@@ -7405,7 +7756,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b91f7eff05f748767f183df4320a63d6936e9c6107d97c9e6bdd9784f4289c94"
 dependencies = [
  "base64 0.21.7",
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
  "serde",
  "serde_derive",
 ]
@@ -7479,16 +7830,16 @@ dependencies = [
  "netlink-packet-utils",
  "netlink-proto",
  "netlink-sys",
- "nix",
+ "nix 0.26.4",
  "thiserror 1.0.69",
  "tokio",
 ]
 
 [[package]]
 name = "rust-embed"
-version = "8.7.1"
+version = "8.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "60e425e204264b144d4c929d126d0de524b40a961686414bab5040f7465c71be"
+checksum = "025908b8682a26ba8d12f6f2d66b987584a4a87bc024abc5bbc12553a8cd178a"
 dependencies = [
  "rust-embed-impl",
  "rust-embed-utils",
@@ -7497,9 +7848,9 @@ dependencies = [
 
 [[package]]
 name = "rust-embed-impl"
-version = "8.7.0"
+version = "8.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6bf418c9a2e3f6663ca38b8a7134cc2c2167c9d69688860e8961e3faa731702e"
+checksum = "6065f1a4392b71819ec1ea1df1120673418bf386f50de1d6f54204d836d4349c"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -7510,9 +7861,9 @@ dependencies = [
 
 [[package]]
 name = "rust-embed-utils"
-version = "8.7.0"
+version = "8.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08d55b95147fe01265d06b3955db798bdaed52e60e2211c41137701b3aba8e21"
+checksum = "f6cc0c81648b20b70c491ff8cce00c1c3b223bb8ed2b5d41f0e54c6c4c0a3594"
 dependencies = [
  "sha2 0.10.9",
  "walkdir",
@@ -7580,29 +7931,16 @@ dependencies = [
  "nom",
 ]
 
-[[package]]
-name = "rustix"
-version = "0.38.44"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154"
-dependencies = [
- "bitflags 2.9.0",
- "errno",
- "libc",
- "linux-raw-sys 0.4.15",
- "windows-sys 0.59.0",
-]
-
 [[package]]
 name = "rustix"
 version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266"
 dependencies = [
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
  "errno",
  "libc",
- "linux-raw-sys 0.9.4",
+ "linux-raw-sys",
  "windows-sys 0.59.0",
 ]
 
@@ -7658,7 +7996,7 @@ version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "19787cda76408ec5404443dc8b31795c87cd8fec49762dc75fa727740d34acc1"
 dependencies = [
- "core-foundation 0.10.0",
+ "core-foundation 0.10.1",
  "core-foundation-sys",
  "jni",
  "log",
@@ -7702,9 +8040,9 @@ dependencies = [
 
 [[package]]
 name = "rustversion"
-version = "1.0.20"
+version = "1.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2"
+checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d"
 
 [[package]]
 name = "rusty-fork"
@@ -7718,6 +8056,28 @@ dependencies = [
  "wait-timeout",
 ]
 
+[[package]]
+name = "rustyline"
+version = "14.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7803e8936da37efd9b6d4478277f4b2b9bb5cdb37a113e8d63222e58da647e63"
+dependencies = [
+ "bitflags 2.9.1",
+ "cfg-if",
+ "clipboard-win",
+ "fd-lock",
+ "home",
+ "libc",
+ "log",
+ "memchr",
+ "nix 0.28.0",
+ "radix_trie",
+ "unicode-segmentation",
+ "unicode-width 0.1.14",
+ "utf8parse",
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "rw-stream-sink"
 version = "0.4.0"
@@ -7762,6 +8122,48 @@ dependencies = [
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "schemafy"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8aea5ba40287dae331f2c48b64dbc8138541f5e97ee8793caa7948c1f31d86d5"
+dependencies = [
+ "Inflector",
+ "schemafy_core",
+ "schemafy_lib",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "serde_repr",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "schemafy_core"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41781ae092f4fd52c9287efb74456aea0d3b90032d2ecad272bd14dbbcb0511b"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "schemafy_lib"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e953db32579999ca98c451d80801b6f6a7ecba6127196c5387ec0774c528befa"
+dependencies = [
+ "Inflector",
+ "proc-macro2",
+ "quote",
+ "schemafy_core",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "syn 1.0.109",
+]
+
 [[package]]
 name = "schemars"
 version = "0.8.22"
@@ -7858,7 +8260,7 @@ version = "2.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
 dependencies = [
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
  "core-foundation 0.9.4",
  "core-foundation-sys",
  "libc",
@@ -7871,8 +8273,8 @@ version = "3.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316"
 dependencies = [
- "bitflags 2.9.0",
- "core-foundation 0.10.0",
+ "bitflags 2.9.1",
+ "core-foundation 0.10.1",
  "core-foundation-sys",
  "libc",
  "security-framework-sys",
@@ -8267,7 +8669,7 @@ dependencies = [
 [[package]]
 name = "shared-crypto"
 version = "0.0.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "bcs",
  "eyre",
@@ -8386,18 +8788,18 @@ dependencies = [
 
 [[package]]
 name = "snafu"
-version = "0.8.5"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019"
+checksum = "320b01e011bf8d5d7a4a4a4be966d9160968935849c83b918827f6a435e7f627"
 dependencies = [
  "snafu-derive",
 ]
 
 [[package]]
 name = "snafu-derive"
-version = "0.8.5"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917"
+checksum = "1961e2ef424c1424204d3a5d6975f934f56b6d50ff5732382d84ebf460e147f7"
 dependencies = [
  "heck 0.5.0",
  "proc-macro2",
@@ -8430,9 +8832,9 @@ dependencies = [
 
 [[package]]
 name = "socket2"
-version = "0.5.9"
+version = "0.5.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef"
+checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678"
 dependencies = [
  "libc",
  "windows-sys 0.52.0",
@@ -8597,7 +8999,7 @@ checksum = "aa003f0038df784eb8fecbbac13affe3da23b45194bd57dba231c8f48199c526"
 dependencies = [
  "atoi",
  "base64 0.22.1",
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
  "byteorder",
  "bytes",
  "crc",
@@ -8639,7 +9041,7 @@ checksum = "db58fcd5a53cf07c184b154801ff91347e4c30d17a3562a635ff028ad5deda46"
 dependencies = [
  "atoi",
  "base64 0.22.1",
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
  "byteorder",
  "crc",
  "dotenvy",
@@ -8711,12 +9113,114 @@ dependencies = [
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "starlark"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f53849859f05d9db705b221bd92eede93877fd426c1b4a3c3061403a5912a8f"
+dependencies = [
+ "allocative",
+ "anyhow",
+ "bumpalo",
+ "cmp_any",
+ "debugserver-types",
+ "derivative",
+ "derive_more 1.0.0",
+ "display_container",
+ "dupe",
+ "either",
+ "erased-serde",
+ "hashbrown 0.14.5",
+ "inventory",
+ "itertools 0.13.0",
+ "maplit",
+ "memoffset",
+ "num-bigint 0.4.6",
+ "num-traits",
+ "once_cell",
+ "paste",
+ "ref-cast",
+ "regex",
+ "rustyline",
+ "serde",
+ "serde_json",
+ "starlark_derive",
+ "starlark_map",
+ "starlark_syntax",
+ "static_assertions",
+ "strsim 0.10.0",
+ "textwrap",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "starlark_derive"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe58bc6c8b7980a1fe4c9f8f48200c3212db42ebfe21ae6a0336385ab53f082a"
+dependencies = [
+ "dupe",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.101",
+]
+
+[[package]]
+name = "starlark_map"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92659970f120df0cc1c0bb220b33587b7a9a90e80d4eecc5c5af5debb950173d"
+dependencies = [
+ "allocative",
+ "dupe",
+ "equivalent",
+ "fxhash",
+ "hashbrown 0.14.5",
+ "serde",
+]
+
+[[package]]
+name = "starlark_syntax"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe53b3690d776aafd7cb6b9fed62d94f83280e3b87d88e3719cc0024638461b3"
+dependencies = [
+ "allocative",
+ "annotate-snippets",
+ "anyhow",
+ "derivative",
+ "derive_more 1.0.0",
+ "dupe",
+ "lalrpop",
+ "lalrpop-util",
+ "logos",
+ "lsp-types 0.94.1",
+ "memchr",
+ "num-bigint 0.4.6",
+ "num-traits",
+ "once_cell",
+ "starlark_map",
+ "thiserror 1.0.69",
+]
+
 [[package]]
 name = "static_assertions"
 version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
 
+[[package]]
+name = "string_cache"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
+dependencies = [
+ "new_debug_unreachable",
+ "parking_lot",
+ "phf_shared",
+ "precomputed-hash",
+]
+
 [[package]]
 name = "stringprep"
 version = "0.1.5"
@@ -8799,7 +9303,7 @@ checksum = "734676eb262c623cec13c3155096e08d1f8f29adce39ba17948b18dad1e54142"
 [[package]]
 name = "sui-config"
 version = "0.0.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anemo",
  "anyhow",
@@ -8818,19 +9322,22 @@ dependencies = [
  "rand 0.8.5",
  "reqwest",
  "serde",
+ "serde_json",
  "serde_with",
  "serde_yaml 0.8.26",
+ "starlark",
  "sui-keys",
  "sui-protocol-config",
  "sui-rpc-api",
  "sui-types",
+ "thiserror 1.0.69",
  "tracing",
 ]
 
 [[package]]
 name = "sui-enum-compat-util"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "serde_yaml 0.8.26",
 ]
@@ -8838,7 +9345,7 @@ dependencies = [
 [[package]]
 name = "sui-http"
 version = "0.0.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "bytes",
  "http 1.3.1",
@@ -8858,7 +9365,7 @@ dependencies = [
 [[package]]
 name = "sui-json"
 version = "0.0.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "bcs",
@@ -8875,7 +9382,7 @@ dependencies = [
 [[package]]
 name = "sui-json-rpc-api"
 version = "0.0.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "fastcrypto 0.1.8",
@@ -8895,7 +9402,7 @@ dependencies = [
 [[package]]
 name = "sui-json-rpc-types"
 version = "0.0.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "bcs",
@@ -8927,7 +9434,7 @@ dependencies = [
 [[package]]
 name = "sui-keys"
 version = "0.0.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "bip32",
@@ -8946,7 +9453,7 @@ dependencies = [
 [[package]]
 name = "sui-macros"
 version = "0.7.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "futures",
  "once_cell",
@@ -8956,8 +9463,8 @@ dependencies = [
 
 [[package]]
 name = "sui-open-rpc"
-version = "1.47.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+version = "1.49.1"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "bcs",
  "schemars",
@@ -8969,7 +9476,7 @@ dependencies = [
 [[package]]
 name = "sui-open-rpc-macros"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "derive-syn-parse",
  "itertools 0.13.0",
@@ -8982,7 +9489,7 @@ dependencies = [
 [[package]]
 name = "sui-package-resolver"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "async-trait",
  "bcs",
@@ -9001,7 +9508,7 @@ dependencies = [
 [[package]]
 name = "sui-proc-macros"
 version = "0.7.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "msim-macros",
  "proc-macro2",
@@ -9013,7 +9520,7 @@ dependencies = [
 [[package]]
 name = "sui-protocol-config"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "clap",
  "move-vm-config",
@@ -9028,7 +9535,7 @@ dependencies = [
 [[package]]
 name = "sui-protocol-config-macros"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -9038,7 +9545,7 @@ dependencies = [
 [[package]]
 name = "sui-rpc-api"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "async-stream",
@@ -9064,7 +9571,6 @@ dependencies = [
  "serde_with",
  "sui-protocol-config",
  "sui-sdk-types",
- "sui-transaction-builder 0.0.4",
  "sui-types",
  "tap",
  "thiserror 1.0.69",
@@ -9081,8 +9587,8 @@ dependencies = [
 
 [[package]]
 name = "sui-sdk"
-version = "1.47.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+version = "1.49.1"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -9105,7 +9611,7 @@ dependencies = [
  "sui-json-rpc-api",
  "sui-json-rpc-types",
  "sui-keys",
- "sui-transaction-builder 0.0.0",
+ "sui-transaction-builder",
  "sui-types",
  "thiserror 1.0.69",
  "tokio",
@@ -9131,32 +9637,10 @@ dependencies = [
  "winnow",
 ]
 
-[[package]]
-name = "sui-tls"
-version = "0.0.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
-dependencies = [
- "anyhow",
- "arc-swap",
- "axum 0.8.4",
- "axum-server",
- "ed25519",
- "fastcrypto 0.1.8",
- "pkcs8 0.10.2",
- "rcgen",
- "reqwest",
- "rustls",
- "rustls-webpki 0.103.3",
- "tokio",
- "tokio-rustls",
- "tower-layer",
- "x509-parser 0.17.0",
-]
-
 [[package]]
 name = "sui-transaction-builder"
 version = "0.0.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -9170,24 +9654,10 @@ dependencies = [
  "sui-types",
 ]
 
-[[package]]
-name = "sui-transaction-builder"
-version = "0.0.4"
-source = "git+https://github.com/MystenLabs/sui-rust-sdk.git?rev=83ff809bc11cbabda21b60130e1f5420170548bf#83ff809bc11cbabda21b60130e1f5420170548bf"
-dependencies = [
- "base64ct",
- "bcs",
- "serde",
- "serde_json",
- "serde_with",
- "sui-sdk-types",
- "thiserror 2.0.12",
-]
-
 [[package]]
 name = "sui-types"
 version = "0.1.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "anemo",
  "anyhow",
@@ -9215,6 +9685,7 @@ dependencies = [
  "move-core-types",
  "move-vm-profiler",
  "move-vm-test-utils",
+ "mysten-common",
  "mysten-metrics",
  "mysten-network",
  "nonempty",
@@ -9328,7 +9799,7 @@ version = "0.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b"
 dependencies = [
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
  "core-foundation 0.9.4",
  "system-configuration-sys",
 ]
@@ -9388,10 +9859,21 @@ dependencies = [
  "fastrand",
  "getrandom 0.3.3",
  "once_cell",
- "rustix 1.0.7",
+ "rustix",
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "term"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f"
+dependencies = [
+ "dirs-next",
+ "rustversion",
+ "winapi",
+]
+
 [[package]]
 name = "termcolor"
 version = "1.4.1"
@@ -9407,10 +9889,19 @@ version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "45c6481c4829e4cc63825e62c49186a34538b7b2750b73b266581ffb612fb5ed"
 dependencies = [
- "rustix 1.0.7",
+ "rustix",
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "textwrap"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
+dependencies = [
+ "unicode-width 0.1.14",
+]
+
 [[package]]
 name = "thiserror"
 version = "1.0.69"
@@ -9630,7 +10121,7 @@ dependencies = [
 [[package]]
 name = "tokio-rustls"
 version = "0.26.2"
-source = "git+https://github.com/rustls/tokio-rustls?branch=main#8092a899759480b86544207c434b58ace3083346"
+source = "git+https://github.com/rustls/tokio-rustls?branch=main#6a775e132632340d7f788cf1eba1f618d0d9e7b2"
 dependencies = [
  "rustls",
  "tokio",
@@ -9897,7 +10388,7 @@ checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5"
 dependencies = [
  "async-compression",
  "base64 0.21.7",
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
  "bytes",
  "futures-core",
  "futures-util",
@@ -9922,14 +10413,18 @@ dependencies = [
 
 [[package]]
 name = "tower-http"
-version = "0.6.4"
+version = "0.6.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fdb0c213ca27a9f57ab69ddb290fd80d970922355b83ae380b395d3986b8a2e"
+checksum = "5cc2d9e086a412a451384326f521c8123a99a466b329941a9403696bff9b0da2"
 dependencies = [
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
  "bytes",
+ "futures-util",
  "http 1.3.1",
+ "http-body",
+ "iri-string",
  "pin-project-lite",
+ "tower 0.5.2",
  "tower-layer",
  "tower-service",
 ]
@@ -10110,7 +10605,7 @@ dependencies = [
 [[package]]
 name = "typed-store-error"
 version = "0.4.0"
-source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.47.0#f3e72b60708b682f1d286b8d218977c4338e39d9"
+source = "git+https://github.com/mystenlabs/sui?tag=testnet-v1.49.1#3b1d6b3bd63f175b774da557f89af3619b74d783"
 dependencies = [
  "serde",
  "thiserror 1.0.69",
@@ -10400,13 +10895,15 @@ dependencies = [
 
 [[package]]
 name = "uuid"
-version = "1.16.0"
+version = "1.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9"
+checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d"
 dependencies = [
  "getrandom 0.3.3",
+ "js-sys",
  "rand 0.9.1",
  "serde",
+ "wasm-bindgen",
 ]
 
 [[package]]
@@ -10741,12 +11238,24 @@ dependencies = [
 
 [[package]]
 name = "windows"
-version = "0.58.0"
+version = "0.61.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd04d41d93c4992d421894c18c8b43496aa748dd4c081bac0dc93eb0489272b6"
+checksum = "c5ee8f3d025738cb02bad7868bbb5f8a6327501e870bf51f1b455b0a2454a419"
 dependencies = [
- "windows-core 0.58.0",
- "windows-targets 0.52.6",
+ "windows-collections",
+ "windows-core 0.61.2",
+ "windows-future",
+ "windows-link",
+ "windows-numerics",
+]
+
+[[package]]
+name = "windows-collections"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3beeceb5e5cfd9eb1d76b381630e82c4241ccd0d27f1a39ed41b2760b255c5e8"
+dependencies = [
+ "windows-core 0.61.2",
 ]
 
 [[package]]
@@ -10773,46 +11282,33 @@ dependencies = [
 
 [[package]]
 name = "windows-core"
-version = "0.58.0"
+version = "0.61.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ba6d44ec8c2591c134257ce647b7ea6b20335bf6379a27dac5f1641fcf59f99"
-dependencies = [
- "windows-implement 0.58.0",
- "windows-interface 0.58.0",
- "windows-result 0.2.0",
- "windows-strings 0.1.0",
- "windows-targets 0.52.6",
-]
-
-[[package]]
-name = "windows-core"
-version = "0.61.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4763c1de310c86d75a878046489e2e5ba02c649d185f21c67d4cf8a56d098980"
+checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3"
 dependencies = [
  "windows-implement 0.60.0",
  "windows-interface 0.59.1",
  "windows-link",
- "windows-result 0.3.2",
- "windows-strings 0.4.0",
+ "windows-result 0.3.4",
+ "windows-strings 0.4.2",
 ]
 
 [[package]]
-name = "windows-implement"
-version = "0.57.0"
+name = "windows-future"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7"
+checksum = "fc6a41e98427b19fe4b73c550f060b59fa592d7d686537eebf9385621bfbad8e"
 dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.101",
+ "windows-core 0.61.2",
+ "windows-link",
+ "windows-threading",
 ]
 
 [[package]]
 name = "windows-implement"
-version = "0.58.0"
+version = "0.57.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b"
+checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -10841,17 +11337,6 @@ dependencies = [
  "syn 2.0.101",
 ]
 
-[[package]]
-name = "windows-interface"
-version = "0.58.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.101",
-]
-
 [[package]]
 name = "windows-interface"
 version = "0.59.1"
@@ -10869,13 +11354,23 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38"
 
+[[package]]
+name = "windows-numerics"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9150af68066c4c5c07ddc0ce30421554771e528bde427614c61038bc2c92c2b1"
+dependencies = [
+ "windows-core 0.61.2",
+ "windows-link",
+]
+
 [[package]]
 name = "windows-registry"
 version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4286ad90ddb45071efd1a66dfa43eb02dd0dfbae1545ad6cc3c51cf34d7e8ba3"
 dependencies = [
- "windows-result 0.3.2",
+ "windows-result 0.3.4",
  "windows-strings 0.3.1",
  "windows-targets 0.53.0",
 ]
@@ -10891,32 +11386,13 @@ dependencies = [
 
 [[package]]
 name = "windows-result"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e"
-dependencies = [
- "windows-targets 0.52.6",
-]
-
-[[package]]
-name = "windows-result"
-version = "0.3.2"
+version = "0.3.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c64fd11a4fd95df68efcfee5f44a294fe71b8bc6a91993e2791938abcc712252"
+checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6"
 dependencies = [
  "windows-link",
 ]
 
-[[package]]
-name = "windows-strings"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10"
-dependencies = [
- "windows-result 0.2.0",
- "windows-targets 0.52.6",
-]
-
 [[package]]
 name = "windows-strings"
 version = "0.3.1"
@@ -10928,9 +11404,9 @@ dependencies = [
 
 [[package]]
 name = "windows-strings"
-version = "0.4.0"
+version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a2ba9642430ee452d5a7aa78d72907ebe8cfda358e8cb7918a2050581322f97"
+checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57"
 dependencies = [
  "windows-link",
 ]
@@ -11033,6 +11509,15 @@ dependencies = [
  "windows_x86_64_msvc 0.53.0",
 ]
 
+[[package]]
+name = "windows-threading"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b66463ad2e0ea3bbf808b7f1d371311c80e115c0b71d60efc142cafbcfb057a6"
+dependencies = [
+ "windows-link",
+]
+
 [[package]]
 name = "windows_aarch64_gnullvm"
 version = "0.42.2"
@@ -11238,7 +11723,7 @@ version = "0.39.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
 dependencies = [
- "bitflags 2.9.0",
+ "bitflags 2.9.1",
 ]
 
 [[package]]
@@ -11373,16 +11858,16 @@ dependencies = [
 
 [[package]]
 name = "yamux"
-version = "0.13.4"
+version = "0.13.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "17610762a1207ee816c6fadc29220904753648aba0a9ed61c7b8336e80a559c4"
+checksum = "3da1acad1c2dc53f0dde419115a38bd8221d8c3e47ae9aeceaf453266d29307e"
 dependencies = [
  "futures",
  "log",
  "nohash-hasher",
  "parking_lot",
  "pin-project",
- "rand 0.8.5",
+ "rand 0.9.1",
  "static_assertions",
  "web-time",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 8b15b490..c152f46e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -70,8 +70,8 @@ serde_yaml                  = "0.9.34"
 serial_test                 = "3.1.1"
 sha2                        = "0.10.9"
 sqlx                        = "0.8.6"
-sui-keys                    = { git = "https://github.com/mystenlabs/sui", package = "sui-keys", tag = "testnet-v1.47.0" }
-sui-sdk                     = { git = "https://github.com/mystenlabs/sui", package = "sui-sdk", tag = "testnet-v1.47.0" }
+sui-keys                    = { git = "https://github.com/mystenlabs/sui", package = "sui-keys", tag = "testnet-v1.49.1" }
+sui-sdk                     = { git = "https://github.com/mystenlabs/sui", package = "sui-sdk", tag = "testnet-v1.49.1" }
 sysinfo                     = "0.33.1"
 tempfile                    = "3.20.0"
 thiserror                   = "2.0.12"
diff --git a/atoma-bin/atoma_node.rs b/atoma-bin/atoma_node.rs
index 16f87281..f28a2321 100644
--- a/atoma-bin/atoma_node.rs
+++ b/atoma-bin/atoma_node.rs
@@ -213,11 +213,13 @@ async fn main() -> Result<()> {
 
     let keystore = FileBasedKeystore::new(&config.sui.sui_keystore_path().into())
         .context("Failed to initialize keystore")?;
-    let mut wallet_ctx = WalletContext::new(
-        &PathBuf::from(config.sui.sui_config_path()),
-        config.sui.request_timeout(),
-        config.sui.max_concurrent_requests(),
-    )?;
+    let mut wallet_ctx = WalletContext::new(&PathBuf::from(config.sui.sui_config_path()))?;
+    if let Some(request_timeout) = config.sui.request_timeout() {
+        wallet_ctx = wallet_ctx.with_request_timeout(request_timeout);
+    }
+    if let Some(max_concurrent_requests) = config.sui.max_concurrent_requests() {
+        wallet_ctx = wallet_ctx.with_max_concurrent_requests(max_concurrent_requests);
+    }
     let address = wallet_ctx.active_address()?;
     let address_index = args.address_index.unwrap_or_else(|| {
         wallet_ctx
diff --git a/atoma-sui/src/client.rs b/atoma-sui/src/client.rs
index 43ac0237..0e078f34 100644
--- a/atoma-sui/src/client.rs
+++ b/atoma-sui/src/client.rs
@@ -86,11 +86,13 @@ impl Client {
     pub async fn new(config: SuiConfig) -> Result<Self> {
         let sui_config_path = config.sui_config_path();
         let sui_config_path = Path::new(&sui_config_path);
-        let mut wallet_ctx = WalletContext::new(
-            sui_config_path,
-            config.request_timeout(),
-            config.max_concurrent_requests(),
-        )?;
+        let mut wallet_ctx = WalletContext::new(sui_config_path)?;
+        if let Some(request_timeout) = config.request_timeout() {
+            wallet_ctx = wallet_ctx.with_request_timeout(request_timeout);
+        }
+        if let Some(max_concurrent_requests) = config.max_concurrent_requests() {
+            wallet_ctx = wallet_ctx.with_max_concurrent_requests(max_concurrent_requests);
+        }
         let active_address = wallet_ctx.active_address()?;
         info!("Current active address: {}", active_address);
         let node_badge = utils::get_node_badge(

From 9b67457a3f2238bfa53dea21b96a64655f338dc6 Mon Sep 17 00:00:00 2001
From: Martin Stefcek <35243812+Cifko@users.noreply.github.com>
Date: Wed, 4 Jun 2025 18:04:11 +0200
Subject: [PATCH 11/13] feat: add max number of queued requests configuration
 and update request handling (#656)

---
 atoma-bin/atoma_node.rs                        |  3 ++-
 atoma-service/src/config.rs                    |  4 ++++
 atoma-service/src/handlers/chat_completions.rs | 10 ++++++++--
 atoma-service/src/handlers/completions.rs      | 10 ++++++++--
 atoma-service/src/handlers/mod.rs              | 10 ++++++++++
 atoma-service/src/middleware.rs                | 11 ++++++++++-
 atoma-service/src/server.rs                    |  7 +++++--
 atoma-service/src/tests.rs                     |  3 ++-
 config.example.toml                            |  1 +
 9 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/atoma-bin/atoma_node.rs b/atoma-bin/atoma_node.rs
index f28a2321..ed234a1c 100644
--- a/atoma-bin/atoma_node.rs
+++ b/atoma-bin/atoma_node.rs
@@ -375,11 +375,12 @@ async fn main() -> Result<()> {
         keystore: Arc::new(keystore),
         address_index,
         whitelist_sui_addresses_for_fiat: config.service.whitelist_sui_addresses_for_fiat,
-        too_many_requests: Arc::new(DashSet::new()),
+        too_many_requests: Arc::new(DashMap::new()),
         too_many_requests_timeout_ms: u128::from(config.service.too_many_requests_timeout_ms),
         running_num_requests: Arc::new(RequestCounter::new()),
         memory_lower_threshold: config.service.memory_lower_threshold,
         memory_upper_threshold: config.service.memory_upper_threshold,
+        max_num_queued_requests: config.service.max_num_queued_requests,
     };
 
     let chat_completions_service_urls = app_state
diff --git a/atoma-service/src/config.rs b/atoma-service/src/config.rs
index e6725ff3..9e2aa509 100644
--- a/atoma-service/src/config.rs
+++ b/atoma-service/src/config.rs
@@ -63,8 +63,12 @@ pub struct AtomaServiceConfig {
 
     ///Lower threshold for memory usage, if the memory usage goes below this value, the service will not be considered overloaded
     pub memory_lower_threshold: f64,
+
     /// Upper threshold for memory usage, if the memory usage goes above this value, the service will be considered overloaded
     pub memory_upper_threshold: f64,
+
+    /// The maximum number of queued requests for each inference service.
+    pub max_num_queued_requests: f64,
 }
 
 impl AtomaServiceConfig {
diff --git a/atoma-service/src/handlers/chat_completions.rs b/atoma-service/src/handlers/chat_completions.rs
index 20fa6ae9..f53815c1 100644
--- a/atoma-service/src/handlers/chat_completions.rs
+++ b/atoma-service/src/handlers/chat_completions.rs
@@ -909,6 +909,7 @@ async fn handle_streaming_response(
             chat_completions_service_urls,
             &model.to_lowercase(),
             state.memory_upper_threshold,
+            state.max_num_queued_requests,
         )
         .await
         .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
@@ -916,7 +917,9 @@ async fn handle_streaming_response(
             endpoint: endpoint.clone(),
         })?;
     if status_code == StatusCode::TOO_MANY_REQUESTS {
-        state.too_many_requests.insert(model.to_string());
+        state
+            .too_many_requests
+            .insert(model.to_string(), Instant::now());
         return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
             message: "Too many requests".to_string(),
             endpoint: endpoint.clone(),
@@ -1341,6 +1344,7 @@ pub mod utils {
                 chat_completions_service_url_services,
                 model,
                 state.memory_upper_threshold,
+                state.max_num_queued_requests,
             )
             .await
             .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
@@ -1348,7 +1352,9 @@ pub mod utils {
                 endpoint: endpoint.to_string(),
             })?;
         if status_code == StatusCode::TOO_MANY_REQUESTS {
-            state.too_many_requests.insert(model.to_string());
+            state
+                .too_many_requests
+                .insert(model.to_string(), Instant::now());
             return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
                 message: "Too many requests".to_string(),
                 endpoint: endpoint.to_string(),
diff --git a/atoma-service/src/handlers/completions.rs b/atoma-service/src/handlers/completions.rs
index 0d80a52f..4b44ded0 100644
--- a/atoma-service/src/handlers/completions.rs
+++ b/atoma-service/src/handlers/completions.rs
@@ -882,6 +882,7 @@ async fn handle_streaming_response(
         chat_completions_service_urls,
         model,
         state.memory_upper_threshold,
+        state.max_num_queued_requests,
     )
     .await
     .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
@@ -889,7 +890,9 @@ async fn handle_streaming_response(
         endpoint: endpoint.clone(),
     })?;
     if status_code == StatusCode::TOO_MANY_REQUESTS {
-        state.too_many_requests.insert(model.to_string());
+        state
+            .too_many_requests
+            .insert(model.to_string(), Instant::now());
         return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
             message: "Too many requests".to_string(),
             endpoint: endpoint.clone(),
@@ -1303,6 +1306,7 @@ pub mod utils {
                 completions_service_url_services,
                 model,
                 state.memory_upper_threshold,
+                state.max_num_queued_requests,
             )
             .await
             .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
@@ -1310,7 +1314,9 @@ pub mod utils {
                 endpoint: endpoint.to_string(),
             })?;
         if status_code == StatusCode::TOO_MANY_REQUESTS {
-            state.too_many_requests.insert(model.to_string());
+            state
+                .too_many_requests
+                .insert(model.to_string(), Instant::now());
             return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
                 message: "Too many requests".to_string(),
                 endpoint: endpoint.to_string(),
diff --git a/atoma-service/src/handlers/mod.rs b/atoma-service/src/handlers/mod.rs
index 085163dd..ed5dadab 100644
--- a/atoma-service/src/handlers/mod.rs
+++ b/atoma-service/src/handlers/mod.rs
@@ -1147,6 +1147,7 @@ pub mod inference_service_metrics {
         chat_completions_service_urls: &[(String, String, usize)], // (url, job, max_concurrent_requests)
         model: &str,
         memory_upper_threshold: f64,
+        max_num_queued_requests: f64,
     ) -> Result<(String, StatusCode)> {
         let mut metrics_results = get_all_metrics(chat_completions_service_urls, model)
             .await
@@ -1178,6 +1179,15 @@ pub mod inference_service_metrics {
                 &metric.chat_completions_service_url,
                 metric.max_number_of_running_requests,
             ) {
+                if metric.num_queued_requests > max_num_queued_requests {
+                    tracing::debug!(
+                        target = "atoma-service",
+                        level = "debug",
+                        "Number of queued requests for model: {model} is too high: {}",
+                        metric.num_queued_requests
+                    );
+                    continue;
+                }
                 if metric.above_upper_threshold_exceeded(memory_upper_threshold) {
                     tracing::debug!(
                         target = "atoma-service",
diff --git a/atoma-service/src/middleware.rs b/atoma-service/src/middleware.rs
index 701b986a..010d8667 100644
--- a/atoma-service/src/middleware.rs
+++ b/atoma-service/src/middleware.rs
@@ -1613,7 +1613,16 @@ pub mod utils {
         model: &str,
         endpoint: &str,
     ) -> Result<(), AtomaServiceError> {
-        if state.too_many_requests.get(model).is_some() {
+        if let Some(a) = state.too_many_requests.get(model) {
+            if a.elapsed().as_millis() < state.too_many_requests_timeout_ms {
+                tracing::debug!(
+                    target = "atoma-service",
+                    level = "debug",
+                    "Model {} is in the `too_many_requests` map, but the elapsed time since the first occurrence is less than the timeout.",
+                    model
+                );
+                return Ok(());
+            }
             let chat_completions_service_urls = state
                 .chat_completions_service_urls
                 .get(&model.to_lowercase())
diff --git a/atoma-service/src/server.rs b/atoma-service/src/server.rs
index 7e132116..575870ff 100644
--- a/atoma-service/src/server.rs
+++ b/atoma-service/src/server.rs
@@ -1,4 +1,4 @@
-use std::{collections::HashMap, sync::Arc};
+use std::{collections::HashMap, sync::Arc, time::Instant};
 
 use atoma_confidential::types::{
     ConfidentialComputeDecryptionRequest, ConfidentialComputeDecryptionResponse,
@@ -207,7 +207,7 @@ pub struct AppState {
     pub whitelist_sui_addresses_for_fiat: Vec<String>,
 
     /// When was the too many requests triggered for each model.
-    pub too_many_requests: Arc<DashSet<String>>,
+    pub too_many_requests: Arc<DashMap<String, Instant>>,
 
     /// The time for which we triiger too many requests since the first occurrence.
     pub too_many_requests_timeout_ms: u128,
@@ -222,6 +222,9 @@ pub struct AppState {
     /// The lower memory threshold for the node.
     /// This threshold is used to determine when the node can start accepting requests again.
     pub memory_lower_threshold: f64,
+
+    /// The maximum number of queued requests for each inference service.
+    pub max_num_queued_requests: f64,
 }
 
 /// Creates and configures the main router for the application.
diff --git a/atoma-service/src/tests.rs b/atoma-service/src/tests.rs
index b601a696..c180ab41 100644
--- a/atoma-service/src/tests.rs
+++ b/atoma-service/src/tests.rs
@@ -341,11 +341,12 @@ mod middleware {
                 address_index: 0,
                 stack_retrieve_sender,
                 whitelist_sui_addresses_for_fiat: vec![],
-                too_many_requests: Arc::new(DashSet::new()),
+                too_many_requests: Arc::new(DashMap::new()),
                 too_many_requests_timeout_ms: 0,
                 running_num_requests: Arc::new(RequestCounter::new()),
                 memory_lower_threshold: 1.0,
                 memory_upper_threshold: 1.0,
+                max_num_queued_requests: 0.0,
             },
             public_key,
             signature,
diff --git a/config.example.toml b/config.example.toml
index 1a519842..7a91191c 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -46,6 +46,7 @@ image_generations_service_url = "http://image-generations:80"
 # List of models to be used by the service, the current value here is just a placeholder, please change it to the models you want to deploy
 environment                      = "development"                                       # or "production" (for use in sentry, you need to set the Sentry DSN)
 heartbeat_url                    = "my-heartbeat-url"
+max_num_queued_requests          = 1                                                   # Maximum number of queued requests for each inference service. A service whose number of queued requests exceeds this value is considered overloaded and is skipped when selecting a service for new requests
 memory_lower_threshold           = 0.75                                                # Lower threshold for memory usage, if the memory usage goes below this value, the service will not be considered overloaded
 memory_upper_threshold           = 0.9                                                 # Upper threshold for memory usage, if the memory usage goes above this value, the service will be considered overloaded
 metrics_update_interval          = 30

From 5d17fabebd223e59245999b72441ce629d4158c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Ant=C3=B3nio?= <matroid@outlook.com>
Date: Wed, 4 Jun 2025 20:12:28 +0100
Subject: [PATCH 12/13] fix: correct deadlock in `check_if_too_many_requests`
 (#658)

* correct deadlock in check_if_too_many_requests method

* resolve tests

* add changes

* add changes

* continue improving logic

* add changes
---
 atoma-service/src/middleware.rs | 134 ++++++++++++++++++++++++--------
 atoma-service/src/tests.rs      |  24 +++++-
 2 files changed, 123 insertions(+), 35 deletions(-)

diff --git a/atoma-service/src/middleware.rs b/atoma-service/src/middleware.rs
index 010d8667..e2bd608e 100644
--- a/atoma-service/src/middleware.rs
+++ b/atoma-service/src/middleware.rs
@@ -479,7 +479,14 @@ async fn generate_request_from_stack(
                     message: format!("Tx digest cannot be converted to a string, with error: {e}"),
                     endpoint: endpoint.clone(),
                 })?;
-            let tx_digest = TransactionDigest::from_str(tx_digest_str).unwrap();
+            let tx_digest = TransactionDigest::from_str(tx_digest_str).map_err(|e| {
+                AtomaServiceError::InvalidHeader {
+                    message: format!(
+                        "Tx digest is not a valid transaction digest, with error: {e}"
+                    ),
+                    endpoint: endpoint.clone(),
+                }
+            })?;
             utils::request_blockchain_for_stack(
                 &state,
                 tx_digest,
@@ -1590,44 +1597,98 @@ pub mod utils {
         Ok(())
     }
 
-    /// Checks if the model has too many requests.
+    /// Checks if a given model is currently flagged for "too many requests" and,
+    /// if so, whether it should be unflagged based on a timeout or current service metrics.
     ///
-    /// This function checks if the model has too many requests by checking if the elapsed time since the first occurrence is less than the timeout.
+    /// This function implements a cooldown mechanism for models that have recently
+    /// triggered a "too many requests" (429) status.
     ///
     /// # Arguments
-    /// * `state` - The application state containing the too many requests map
-    /// * `model` - The model to check
-    /// * `endpoint` - The API endpoint path being accessed (used for error context)
+    ///
+    /// * `state`: A reference to the application's shared state (`AppState`), which includes:
+    ///     - `too_many_requests`: A `DashMap` tracking models currently in a "too many requests" state and when they entered it.
+    ///     - `too_many_requests_timeout_ms`: The duration (in milliseconds) a model stays flagged before its status is re-evaluated based on metrics.
+    ///     - `chat_completions_service_urls`: A `DashMap` containing the service URLs for different models.
+    ///     - `memory_lower_threshold`: A threshold used to determine if a service's memory usage is low enough to consider it recovered.
+    /// * `model`: The name of the model to check.
+    /// * `endpoint`: The API endpoint path where the request was received (used for error reporting).
     ///
     /// # Returns
-    /// * `Ok(())` - If the model has too many requests
-    /// * `Err(AtomaServiceError)` - If the model has too many requests
     ///
-    /// # Errors
-    /// This function will return an error if:
-    /// - The model has too many requests
-    /// - The elapsed time since the first occurrence is less than the timeout
+    /// * `Ok(())`: If the model is not currently restricted, or if it was restricted but has now been unflagged.
+    /// * `Err(AtomaServiceError::ChatCompletionsServiceUnavailable)`: If the model is currently flagged for "too many requests" and the timeout period has not yet elapsed.
+    /// * `Err(AtomaServiceError::InvalidBody)`: If no service URL is registered for the model. `Err(AtomaServiceError::InternalError)`: If fetching the service metrics fails.
+    ///
+    /// # Logic Flow
+    ///
+    /// 1.  **Initial "Too Many Requests" Check:**
+    ///     - It first attempts to access the `model` in the `state.too_many_requests` map using `entry()`.
+    ///     - If the model is found (Occupied entry):
+    ///         - It retrieves the `Instant` when the model was flagged.
+    ///         - It calculates the time elapsed since flagging.
+    ///         - If `elapsed_ms` is less than `state.too_many_requests_timeout_ms`, the function immediately
+    ///           returns `Err(AtomaServiceError::ChatCompletionsServiceUnavailable)`, indicating the model is still in a cooldown period.
+    ///         - If the timeout has passed, the entry for the model is removed from `state.too_many_requests`. This effectively
+    ///           clears the "too many requests" flag based on the timeout, regardless of current metrics at this stage.
+    ///     - If the model is not found (Vacant entry), it means the model isn't currently flagged for "too many requests" from a previous direct 429 response.
+    ///
+    /// 2.  **Metrics-Based Re-evaluation (if not returned early):**
+    ///     - The function proceeds to fetch the service URLs for the given `model`.
+    ///     - It then asynchronously calls `get_all_metrics` to retrieve current operational metrics for these services.
+    ///     - It checks if any of the retrieved metrics indicate that the service's memory usage is now below `state.memory_lower_threshold`.
+    ///
+    /// 3.  **Final "Too Many Requests" State Update:**
+    ///     - If the metrics show that the service is under the lower memory threshold (indicating potential recovery):
+    ///         - It attempts to remove the `model` from `state.too_many_requests` again. This handles cases where the model might have been
+    ///           re-added by another concurrent request between the initial check and metrics retrieval, or if it was never there but
+    ///           the metrics now allow it.
+    ///     - If the metrics do not show the service is under the lower threshold, no further action is taken on the `too_many_requests` map at this point
+    ///       (it might have been removed by timeout earlier, or was never there).
+    ///
+    /// 4.  The function then returns `Ok(())` if it hasn't returned an error earlier.
+    ///
+    /// # Deadlock Safety
+    ///
+    /// The function is designed to be deadlock-safe with respect to `DashMap` operations:
+    /// - The lock acquired by `state.too_many_requests.entry()` is released before any `.await` point.
+    /// - Subsequent operations on `state.too_many_requests` (like the second `remove` call) acquire new, independent locks.
     #[instrument(level = "info", skip_all, err)]
     pub async fn check_if_too_many_requests(
         state: &AppState,
         model: &str,
         endpoint: &str,
     ) -> Result<(), AtomaServiceError> {
-        if let Some(a) = state.too_many_requests.get(model) {
-            if a.elapsed().as_millis() < state.too_many_requests_timeout_ms {
+        match state.too_many_requests.entry(model.to_string()) {
+            dashmap::mapref::entry::Entry::Occupied(occupied_entry) => {
+                let elapsed_ms = occupied_entry.get().elapsed().as_millis();
+
+                if elapsed_ms < state.too_many_requests_timeout_ms {
+                    tracing::info!(
+                            target = "atoma-service",
+                            level = "info",
+                            "Too many requests for model: {model}, endpoint: {endpoint}, elapsed trigger time: {elapsed_ms} and timeout: {}",
+                            state.too_many_requests_timeout_ms
+                        );
+                    return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
+                        message: "Too many requests".to_string(),
+                        endpoint: endpoint.to_string(),
+                    });
+                }
+                occupied_entry.remove();
+            }
+            dashmap::mapref::entry::Entry::Vacant(_) => {
                 tracing::debug!(
                     target = "atoma-service",
                     level = "debug",
-                    "Model {} is in the `too_many_requests` map, but the elapsed time since the first occurrence is less than the timeout.",
-                    model
+                    "Model is not in the `too_many_requests` map, so no action is needed here. Processing can continue."
                 );
-                return Ok(());
             }
-            let chat_completions_service_urls = state
+        }
+        let chat_completions_service_urls = state
                 .chat_completions_service_urls
                 .get(&model.to_lowercase())
                 .ok_or_else(|| {
-                    AtomaServiceError::InternalError {
+                    AtomaServiceError::InvalidBody {
                         message: format!(
                             "Chat completions service URL not found, likely that model is not supported by the current node: {}",
                             model
@@ -1635,30 +1696,35 @@ pub mod utils {
                         endpoint: endpoint.to_string(),
                     }
                 })?;
-            let metrics = get_all_metrics(chat_completions_service_urls, model)
-                .await
-                .map_err(|e| AtomaServiceError::InternalError {
-                    message: format!("Failed to get metrics for model {model}, with error: {e}"),
-                    endpoint: endpoint.to_string(),
-                })?;
-            if metrics
-                .iter()
-                .any(|metric| metric.under_lower_threshold(state.memory_lower_threshold))
-            {
-                state.too_many_requests.remove(model);
-                tracing::debug!(
+        let metrics = get_all_metrics(chat_completions_service_urls, model)
+            .await
+            .map_err(|e| AtomaServiceError::InternalError {
+                message: format!("Failed to get metrics for model {model}, with error: {e}"),
+                endpoint: endpoint.to_string(),
+            })?;
+        if metrics
+            .iter()
+            .any(|metric| metric.under_lower_threshold(state.memory_lower_threshold))
+        {
+            state.too_many_requests.remove(model);
+            tracing::debug!(
                     target = "atoma-service",
                     level = "debug",
                     "Model {} is in the `too_many_requests` map, but metrics indicate that it is no longer exceeding the lower threshold. Removing from the map.",
                     model
                 );
-            }
-        } else {
+        } else if !metrics.is_empty() {
+            // TODO: Should we add the model to the `too_many_requests` map here?
             tracing::debug!(
                     target = "atoma-service",
                     level = "debug",
-                    "Model is not in the `too_many_requests` map, so no action is needed here. Processing can continue."
+                    "Model {} is in the `too_many_requests` map, but metrics indicate that it is still exceeding the lower threshold. Processing can continue.",
+                    model
                 );
+            return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
+                message: "Service unavailable due to high load (metrics)".to_string(),
+                endpoint: endpoint.to_string(),
+            });
         }
         Ok(())
     }
diff --git a/atoma-service/src/tests.rs b/atoma-service/src/tests.rs
index c180ab41..05938580 100644
--- a/atoma-service/src/tests.rs
+++ b/atoma-service/src/tests.rs
@@ -319,6 +319,27 @@ mod middleware {
             .expect("Failed to remove client.yaml");
         std::fs::remove_dir_all(keystore_path.parent().unwrap())
             .expect("Failed to remove keystore");
+
+        let mut service_urls = HashMap::new();
+        service_urls.insert(
+            "meta-llama/llama-3.1-70b-instruct"
+                .to_string()
+                .to_lowercase(),
+            vec![("http://localhost:8080".to_string(), "vllm".to_string(), 1)],
+        );
+        service_urls.insert(
+            "intfloat/multilingual-e5-large-instruct"
+                .to_string()
+                .to_lowercase(),
+            vec![("http://localhost:8081".to_string(), "vllm".to_string(), 1)],
+        );
+        service_urls.insert(
+            "black-forest-labs/flux.1-schnell"
+                .to_string()
+                .to_lowercase(),
+            vec![("http://localhost:8082".to_string(), "vllm".to_string(), 1)],
+        );
+
         (
             AppState {
                 concurrent_requests_per_stack: Arc::new(DashMap::new()),
@@ -334,7 +355,7 @@ mod middleware {
                 decryption_sender,
                 encryption_sender,
                 compute_shared_secret_sender,
-                chat_completions_service_urls: HashMap::new(),
+                chat_completions_service_urls: service_urls,
                 embeddings_service_url: String::new(),
                 image_generations_service_url: String::new(),
                 keystore: Arc::new(keystore),
@@ -813,6 +834,7 @@ mod middleware {
     #[tokio::test]
     #[serial]
     async fn test_verify_stack_permissions_token_counting() {
+        setup_subscriber();
         let (
             app_state,
             _,

From 8344bf1f84d782a4c3420a1ae763bc698f50a46d Mon Sep 17 00:00:00 2001
From: Martin Stefcek <35243812+Cifko@users.noreply.github.com>
Date: Thu, 5 Jun 2025 12:02:54 +0200
Subject: [PATCH 13/13] fix: normalize model strings to lowercase in request
 handlers (#661)

* fix: normalize model strings to lowercase in request handlers

* fix test

* fix
---
 atoma-bin/atoma_node.rs                       |  9 +-
 .../src/handlers/chat_completions.rs          | 87 +++++++++----------
 atoma-service/src/handlers/completions.rs     | 87 +++++++++----------
 atoma-service/src/handlers/embeddings.rs      | 12 +--
 .../src/handlers/image_generations.rs         | 15 ++--
 atoma-service/src/handlers/mod.rs             |  8 +-
 atoma-service/src/middleware.rs               | 25 +++---
 atoma-service/src/tests.rs                    |  7 +-
 8 files changed, 127 insertions(+), 123 deletions(-)

diff --git a/atoma-bin/atoma_node.rs b/atoma-bin/atoma_node.rs
index ed234a1c..f1332950 100644
--- a/atoma-bin/atoma_node.rs
+++ b/atoma-bin/atoma_node.rs
@@ -362,7 +362,14 @@ async fn main() -> Result<()> {
         encryption_sender: app_state_encryption_sender,
         compute_shared_secret_sender,
         tokenizers: Arc::new(tokenizers),
-        models: Arc::new(config.service.models),
+        models: Arc::new(
+            config
+                .service
+                .models
+                .into_iter()
+                .map(|model| model.to_lowercase())
+                .collect(),
+        ),
         chat_completions_service_urls: config.service.chat_completions_service_urls,
         embeddings_service_url: config
             .service
diff --git a/atoma-service/src/handlers/chat_completions.rs b/atoma-service/src/handlers/chat_completions.rs
index f53815c1..cac2b5f6 100644
--- a/atoma-service/src/handlers/chat_completions.rs
+++ b/atoma-service/src/handlers/chat_completions.rs
@@ -238,7 +238,8 @@ pub async fn chat_completions_handler(
     let model = payload
         .get(MODEL_KEY)
         .and_then(|m| m.as_str())
-        .unwrap_or_default();
+        .unwrap_or_default()
+        .to_lowercase();
 
     match handle_response(
         &state,
@@ -260,35 +261,33 @@ pub async fn chat_completions_handler(
         Ok(response) => {
             CHAT_COMPLETIONS_ESTIMATED_TOTAL_TOKENS.add(
                 num_input_tokens + estimated_output_tokens,
-                &[KeyValue::new(MODEL_KEY, model.to_owned())],
+                &[KeyValue::new(MODEL_KEY, model.clone())],
             );
             if !is_stream {
-                TOTAL_COMPLETED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                TOTAL_COMPLETED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
             }
             Ok(response)
         }
         Err(e) => {
             match e.status_code() {
                 StatusCode::TOO_MANY_REQUESTS => {
-                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::BAD_REQUEST => {
-                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::LOCKED => {
-                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::TOO_EARLY => {
-                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::UNAUTHORIZED => {
-                    TOTAL_UNAUTHORIZED_REQUESTS
-                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_UNAUTHORIZED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 _ => {
-                    TOTAL_FAILED_CHAT_REQUESTS
-                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
-                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_FAILED_CHAT_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
             }
 
@@ -317,7 +316,7 @@ pub async fn chat_completions_handler(
                     &state.state_manager_sender,
                     user_id,
                     user_address,
-                    model.to_string(),
+                    model.clone(),
                     num_input_tokens,
                     0,
                     estimated_output_tokens,
@@ -465,10 +464,10 @@ pub async fn confidential_chat_completions_handler(
     let model = payload
         .get(MODEL_KEY)
         .and_then(|m| m.as_str())
-        .unwrap_or(UNKNOWN_MODEL);
+        .unwrap_or(UNKNOWN_MODEL)
+        .to_lowercase();
 
-    CHAT_COMPLETIONS_CONFIDENTIAL_NUM_REQUESTS
-        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+    CHAT_COMPLETIONS_CONFIDENTIAL_NUM_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
 
     let endpoint = request_metadata.endpoint_path.clone();
 
@@ -492,35 +491,34 @@ pub async fn confidential_chat_completions_handler(
         Ok(response) => {
             CHAT_COMPLETIONS_ESTIMATED_TOTAL_TOKENS.add(
                 num_input_tokens + estimated_output_tokens,
-                &[KeyValue::new(MODEL_KEY, model.to_owned())],
+                &[KeyValue::new(MODEL_KEY, model.clone())],
             );
             if !is_stream {
-                TOTAL_COMPLETED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                TOTAL_COMPLETED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
             }
             Ok(response)
         }
         Err(e) => {
             match e.status_code() {
                 StatusCode::TOO_MANY_REQUESTS => {
-                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::BAD_REQUEST => {
-                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::LOCKED => {
-                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::TOO_EARLY => {
-                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::UNAUTHORIZED => {
-                    TOTAL_UNAUTHORIZED_REQUESTS
-                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_UNAUTHORIZED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 _ => {
                     TOTAL_FAILED_CHAT_CONFIDENTIAL_REQUESTS
-                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
-                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                        .add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
             }
             // NOTE: We need to update the stack number of tokens as the service failed to generate
@@ -548,7 +546,7 @@ pub async fn confidential_chat_completions_handler(
                     &state.state_manager_sender,
                     user_id,
                     user_address,
-                    model.to_string(),
+                    model.clone(),
                     num_input_tokens,
                     0,
                     estimated_output_tokens,
@@ -757,9 +755,10 @@ async fn handle_non_streaming_response(
     let model = payload
         .get(MODEL_KEY)
         .and_then(|m| m.as_str())
-        .unwrap_or(UNKNOWN_MODEL);
+        .unwrap_or(UNKNOWN_MODEL)
+        .to_lowercase();
 
-    CHAT_COMPLETIONS_NUM_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+    CHAT_COMPLETIONS_NUM_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
     let timer = Instant::now();
     debug!(
         target = "atoma-service",
@@ -779,7 +778,7 @@ async fn handle_non_streaming_response(
         level = "debug",
         "Received non-streaming chat completions response from {endpoint}"
     );
-    let (input_tokens, output_tokens) = utils::extract_total_num_tokens(&response_body, model);
+    let (input_tokens, output_tokens) = utils::extract_total_num_tokens(&response_body, &model);
 
     utils::serve_non_streaming_response(
         state,
@@ -796,7 +795,7 @@ async fn handle_non_streaming_response(
         client_encryption_metadata,
         endpoint,
         timer,
-        model,
+        &model,
     )
     .await
 }
@@ -887,13 +886,14 @@ async fn handle_streaming_response(
     let model = payload
         .get(MODEL_KEY)
         .and_then(|m| m.as_str())
-        .unwrap_or(UNKNOWN_MODEL);
-    CHAT_COMPLETIONS_NUM_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+        .unwrap_or(UNKNOWN_MODEL)
+        .to_lowercase();
+    CHAT_COMPLETIONS_NUM_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
     let timer = Instant::now();
 
     let chat_completions_service_urls = state
         .chat_completions_service_urls
-        .get(&model.to_lowercase())
+        .get(&model)
         .ok_or_else(|| {
             AtomaServiceError::InternalError {
                 message: format!(
@@ -907,7 +907,7 @@ async fn handle_streaming_response(
         get_best_available_chat_completions_service_url(
             &state.running_num_requests,
             chat_completions_service_urls,
-            &model.to_lowercase(),
+            &model,
             state.memory_upper_threshold,
             state.max_num_queued_requests,
         )
@@ -917,9 +917,7 @@ async fn handle_streaming_response(
             endpoint: endpoint.clone(),
         })?;
     if status_code == StatusCode::TOO_MANY_REQUESTS {
-        state
-            .too_many_requests
-            .insert(model.to_string(), Instant::now());
+        state.too_many_requests.insert(model, Instant::now());
         return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
             message: "Too many requests".to_string(),
             endpoint: endpoint.clone(),
@@ -999,7 +997,7 @@ async fn handle_streaming_response(
         payload_hash,
         state.keystore.clone(),
         state.address_index,
-        model.to_string(),
+        model.clone(),
         streaming_encryption_metadata,
         endpoint,
         request_id,
@@ -1325,10 +1323,11 @@ pub mod utils {
         let model = payload
             .get(MODEL_KEY)
             .and_then(|m| m.as_str())
-            .unwrap_or(UNKNOWN_MODEL);
+            .unwrap_or(UNKNOWN_MODEL)
+            .to_lowercase();
         let chat_completions_service_url_services = state
             .chat_completions_service_urls
-            .get(&model.to_lowercase())
+            .get(&model)
             .ok_or_else(|| {
                 AtomaServiceError::InternalError {
                     message: format!(
@@ -1342,7 +1341,7 @@ pub mod utils {
             get_best_available_chat_completions_service_url(
                 &state.running_num_requests,
                 chat_completions_service_url_services,
-                model,
+                &model,
                 state.memory_upper_threshold,
                 state.max_num_queued_requests,
             )
@@ -1354,7 +1353,7 @@ pub mod utils {
         if status_code == StatusCode::TOO_MANY_REQUESTS {
             state
                 .too_many_requests
-                .insert(model.to_string(), Instant::now());
+                .insert(model.clone(), Instant::now());
             return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
                 message: "Too many requests".to_string(),
                 endpoint: endpoint.to_string(),
@@ -1678,7 +1677,7 @@ pub mod utils {
                 &state.state_manager_sender,
                 user_id,
                 user_address,
-                model.to_string(),
+                model.to_owned(),
                 estimated_input_tokens,
                 input_tokens,
                 estimated_output_tokens,
diff --git a/atoma-service/src/handlers/completions.rs b/atoma-service/src/handlers/completions.rs
index 4b44ded0..4501bf9f 100644
--- a/atoma-service/src/handlers/completions.rs
+++ b/atoma-service/src/handlers/completions.rs
@@ -212,7 +212,8 @@ pub async fn completions_handler(
     let model = payload
         .get(MODEL_KEY)
         .and_then(|m| m.as_str())
-        .unwrap_or_default();
+        .unwrap_or_default()
+        .to_lowercase();
 
     match handle_response(
         &state,
@@ -234,35 +235,33 @@ pub async fn completions_handler(
         Ok(response) => {
             CHAT_COMPLETIONS_ESTIMATED_TOTAL_TOKENS.add(
                 num_input_tokens + estimated_output_tokens,
-                &[KeyValue::new(MODEL_KEY, model.to_owned())],
+                &[KeyValue::new(MODEL_KEY, model.clone())],
             );
             if !is_stream {
-                TOTAL_COMPLETED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                TOTAL_COMPLETED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
             }
             Ok(response)
         }
         Err(e) => {
             match e.status_code() {
                 StatusCode::TOO_MANY_REQUESTS => {
-                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::BAD_REQUEST => {
-                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::LOCKED => {
-                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::TOO_EARLY => {
-                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::UNAUTHORIZED => {
-                    TOTAL_UNAUTHORIZED_REQUESTS
-                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_UNAUTHORIZED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 _ => {
-                    TOTAL_FAILED_CHAT_REQUESTS
-                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
-                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_FAILED_CHAT_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
             }
             // NOTE: We need to update the stack number of tokens as the service failed to generate
@@ -290,7 +289,7 @@ pub async fn completions_handler(
                     &state.state_manager_sender,
                     user_id,
                     user_address,
-                    model.to_string(),
+                    model.clone(),
                     num_input_tokens,
                     0,
                     estimated_output_tokens,
@@ -438,10 +437,10 @@ pub async fn confidential_completions_handler(
     let model = payload
         .get(MODEL_KEY)
         .and_then(|m| m.as_str())
-        .unwrap_or(UNKNOWN_MODEL);
+        .unwrap_or(UNKNOWN_MODEL)
+        .to_lowercase();
 
-    CHAT_COMPLETIONS_CONFIDENTIAL_NUM_REQUESTS
-        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+    CHAT_COMPLETIONS_CONFIDENTIAL_NUM_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
 
     let endpoint = request_metadata.endpoint_path.clone();
 
@@ -465,35 +464,34 @@ pub async fn confidential_completions_handler(
         Ok(response) => {
             CHAT_COMPLETIONS_ESTIMATED_TOTAL_TOKENS.add(
                 num_input_tokens + estimated_output_tokens,
-                &[KeyValue::new(MODEL_KEY, model.to_owned())],
+                &[KeyValue::new(MODEL_KEY, model.clone())],
             );
             if !is_stream {
-                TOTAL_COMPLETED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                TOTAL_COMPLETED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
             }
             Ok(response)
         }
         Err(e) => {
             match e.status_code() {
                 StatusCode::TOO_MANY_REQUESTS => {
-                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_TOO_MANY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::BAD_REQUEST => {
-                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_BAD_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::LOCKED => {
-                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_LOCKED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::TOO_EARLY => {
-                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_TOO_EARLY_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 StatusCode::UNAUTHORIZED => {
-                    TOTAL_UNAUTHORIZED_REQUESTS
-                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                    TOTAL_UNAUTHORIZED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
                 _ => {
                     TOTAL_FAILED_CHAT_CONFIDENTIAL_REQUESTS
-                        .add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
-                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+                        .add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
+                    TOTAL_FAILED_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
                 }
             }
             if let Some(stack_small_id) = stack_small_id {
@@ -522,7 +520,7 @@ pub async fn confidential_completions_handler(
                     &state.state_manager_sender,
                     user_id,
                     user_address,
-                    model.to_string(),
+                    model.clone(),
                     num_input_tokens,
                     0,
                     estimated_output_tokens,
@@ -731,9 +729,10 @@ async fn handle_non_streaming_response(
     let model = payload
         .get(MODEL_KEY)
         .and_then(|m| m.as_str())
-        .unwrap_or(UNKNOWN_MODEL);
+        .unwrap_or(UNKNOWN_MODEL)
+        .to_lowercase();
 
-    CHAT_COMPLETIONS_NUM_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+    CHAT_COMPLETIONS_NUM_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
     let timer = Instant::now();
     debug!(
         target = "atoma-service",
@@ -753,7 +752,7 @@ async fn handle_non_streaming_response(
         level = "debug",
         "Received non-streaming chat completions response from {endpoint}"
     );
-    let (input_tokens, output_tokens) = utils::extract_total_num_tokens(&response_body, model);
+    let (input_tokens, output_tokens) = utils::extract_total_num_tokens(&response_body, &model);
 
     utils::serve_non_streaming_response(
         state,
@@ -770,7 +769,7 @@ async fn handle_non_streaming_response(
         client_encryption_metadata,
         endpoint,
         timer,
-        model,
+        &model,
     )
     .await
 }
@@ -861,13 +860,14 @@ async fn handle_streaming_response(
     let model = payload
         .get(MODEL_KEY)
         .and_then(|m| m.as_str())
-        .unwrap_or(UNKNOWN_MODEL);
-    CHAT_COMPLETIONS_NUM_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.to_owned())]);
+        .unwrap_or(UNKNOWN_MODEL)
+        .to_lowercase();
+    CHAT_COMPLETIONS_NUM_REQUESTS.add(1, &[KeyValue::new(MODEL_KEY, model.clone())]);
     let timer = Instant::now();
 
     let chat_completions_service_urls = state
         .chat_completions_service_urls
-        .get(&model.to_lowercase())
+        .get(&model)
         .ok_or_else(|| {
             AtomaServiceError::InternalError {
                 message: format!(
@@ -880,7 +880,7 @@ async fn handle_streaming_response(
     let (completions_service_url, status_code) = get_best_available_chat_completions_service_url(
         &state.running_num_requests,
         chat_completions_service_urls,
-        model,
+        &model,
         state.memory_upper_threshold,
         state.max_num_queued_requests,
     )
@@ -892,7 +892,7 @@ async fn handle_streaming_response(
     if status_code == StatusCode::TOO_MANY_REQUESTS {
         state
             .too_many_requests
-            .insert(model.to_string(), Instant::now());
+            .insert(model.clone(), Instant::now());
         return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
             message: "Too many requests".to_string(),
             endpoint: endpoint.clone(),
@@ -968,7 +968,7 @@ async fn handle_streaming_response(
         payload_hash,
         state.keystore.clone(),
         state.address_index,
-        model.to_string(),
+        model.clone(),
         streaming_encryption_metadata,
         endpoint,
         request_id,
@@ -1287,10 +1287,11 @@ pub mod utils {
         let model = payload
             .get(MODEL_KEY)
             .and_then(|m| m.as_str())
-            .unwrap_or(UNKNOWN_MODEL);
+            .unwrap_or(UNKNOWN_MODEL)
+            .to_lowercase();
         let completions_service_url_services = state
             .chat_completions_service_urls
-            .get(&model.to_lowercase())
+            .get(&model)
             .ok_or_else(|| {
                 AtomaServiceError::InternalError {
                     message: format!(
@@ -1304,7 +1305,7 @@ pub mod utils {
             get_best_available_chat_completions_service_url(
                 &state.running_num_requests,
                 completions_service_url_services,
-                model,
+                &model,
                 state.memory_upper_threshold,
                 state.max_num_queued_requests,
             )
@@ -1314,9 +1315,7 @@ pub mod utils {
                 endpoint: endpoint.to_string(),
             })?;
         if status_code == StatusCode::TOO_MANY_REQUESTS {
-            state
-                .too_many_requests
-                .insert(model.to_string(), Instant::now());
+            state.too_many_requests.insert(model, Instant::now());
             return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
                 message: "Too many requests".to_string(),
                 endpoint: endpoint.to_string(),
@@ -1643,7 +1642,7 @@ pub mod utils {
                 &state.state_manager_sender,
                 user_id,
                 user_address,
-                model.to_string(),
+                model.to_owned(),
                 num_input_tokens,
                 input_tokens,
                 estimated_output_tokens,
diff --git a/atoma-service/src/handlers/embeddings.rs b/atoma-service/src/handlers/embeddings.rs
index 7f3ba54d..25acd20d 100644
--- a/atoma-service/src/handlers/embeddings.rs
+++ b/atoma-service/src/handlers/embeddings.rs
@@ -98,7 +98,8 @@ pub async fn embeddings_handler(
         .get(MODEL_KEY)
         .and_then(|m| m.as_str())
         .unwrap_or("unknown")
-        .to_string();
+        .to_string()
+        .to_lowercase();
 
     let RequestMetadata {
         stack_small_id,
@@ -181,7 +182,7 @@ pub async fn embeddings_handler(
                     &state.state_manager_sender,
                     user_id,
                     user_address,
-                    model.to_string(),
+                    model.clone(),
                     num_input_tokens,
                     0,
                     estimated_output_tokens,
@@ -263,7 +264,8 @@ pub async fn confidential_embeddings_handler(
         .get(MODEL_KEY)
         .and_then(|m| m.as_str())
         .unwrap_or("unknown")
-        .to_string();
+        .to_string()
+        .to_lowercase();
 
     TEXT_EMBEDDINGS_CONFIDENTIAL_NUM_REQUESTS
         .add(1, &[KeyValue::new("model", model.as_str().to_owned())]);
@@ -321,7 +323,7 @@ pub async fn confidential_embeddings_handler(
                     &state.state_manager_sender,
                     user_id,
                     user_address,
-                    model.to_string(),
+                    model.clone(),
                     num_input_tokens,
                     num_input_tokens,
                     estimated_output_tokens,
@@ -374,7 +376,7 @@ pub async fn confidential_embeddings_handler(
                     &state.state_manager_sender,
                     user_id,
                     user_address,
-                    model.to_string(),
+                    model.clone(),
                     num_input_tokens,
                     0,
                     estimated_output_tokens,
diff --git a/atoma-service/src/handlers/image_generations.rs b/atoma-service/src/handlers/image_generations.rs
index b1fb79c4..b66e6bcc 100644
--- a/atoma-service/src/handlers/image_generations.rs
+++ b/atoma-service/src/handlers/image_generations.rs
@@ -100,9 +100,10 @@ pub async fn image_generations_handler(
     let model = payload
         .get(MODEL_KEY)
         .and_then(|m| m.as_str())
-        .unwrap_or("unknown");
+        .unwrap_or("unknown")
+        .to_lowercase();
 
-    IMAGE_GEN_NUM_REQUESTS.add(1, &[KeyValue::new("model", model.to_owned())]);
+    IMAGE_GEN_NUM_REQUESTS.add(1, &[KeyValue::new("model", model.clone())]);
     let timer = Instant::now();
 
     let RequestMetadata {
@@ -122,7 +123,7 @@ pub async fn image_generations_handler(
         .get(MODEL_KEY)
         .and_then(|m| m.as_str())
         .unwrap_or("unknown")
-        .to_string();
+        .to_lowercase();
 
     match handle_image_generations_response(
         &state,
@@ -182,7 +183,7 @@ pub async fn image_generations_handler(
                     &state.state_manager_sender,
                     user_id,
                     user_address,
-                    model.to_string(),
+                    model.clone(),
                     num_input_tokens,
                     0,
                     estimated_output_tokens,
@@ -265,7 +266,7 @@ pub async fn confidential_image_generations_handler(
         .get(MODEL_KEY)
         .and_then(|m| m.as_str())
         .unwrap_or("unknown")
-        .to_string();
+        .to_lowercase();
 
     IMAGE_GEN_CONFIDENTIAL_NUM_REQUESTS
         .add(1, &[KeyValue::new("model", model.as_str().to_owned())]);
@@ -317,7 +318,7 @@ pub async fn confidential_image_generations_handler(
                     &state.state_manager_sender,
                     user_id,
                     user_address,
-                    model.to_string(),
+                    model.clone(),
                     num_input_tokens,
                     num_input_tokens,
                     estimated_output_tokens,
@@ -370,7 +371,7 @@ pub async fn confidential_image_generations_handler(
                     &state.state_manager_sender,
                     user_id,
                     user_address,
-                    model.to_string(),
+                    model.clone(),
                     num_input_tokens,
                     0,
                     estimated_output_tokens,
diff --git a/atoma-service/src/handlers/mod.rs b/atoma-service/src/handlers/mod.rs
index ed5dadab..fd6e04d4 100644
--- a/atoma-service/src/handlers/mod.rs
+++ b/atoma-service/src/handlers/mod.rs
@@ -931,7 +931,7 @@ pub mod inference_service_metrics {
                 "No chat completions service URLs provided for model."
             );
             return Err(
-                ChatCompletionsMetricsError::NoChatCompletionsServiceUrlsFound(model.to_string()),
+                ChatCompletionsMetricsError::NoChatCompletionsServiceUrlsFound(model.to_owned()),
             );
         }
         tracing::debug!(
@@ -976,7 +976,7 @@ pub mod inference_service_metrics {
                 .iter()
                 .map(|(url, job, max_concurrent_requests)| {
                     (
-                        model.to_string(),
+                        model.to_owned(),
                         url.clone(),
                         job.clone(),
                         *max_concurrent_requests,
@@ -1009,7 +1009,7 @@ pub mod inference_service_metrics {
                 .iter()
                 .map(|(url, job, max_concurrent_requests)| {
                     (
-                        model.to_string(),
+                        model.to_owned(),
                         url.clone(),
                         job.clone(),
                         *max_concurrent_requests,
@@ -1048,7 +1048,7 @@ pub mod inference_service_metrics {
                         "current_model = {current_model}, model = {model}, they are equal = {}",
                         current_model == model
                     );
-                    if current_model.to_lowercase() != model.to_lowercase() {
+                    if current_model != model {
                         // NOTE: We only want to consider metrics for the current model
                         continue;
                     }
diff --git a/atoma-service/src/middleware.rs b/atoma-service/src/middleware.rs
index e2bd608e..8a00b872 100644
--- a/atoma-service/src/middleware.rs
+++ b/atoma-service/src/middleware.rs
@@ -603,7 +603,7 @@ async fn generate_fiat_request(
     state
         .state_manager_sender
         .send(AtomaAtomaStateManagerEvent::GetModelPricing {
-            model: model.to_string(),
+            model: model.to_owned(),
             result_sender,
         })
         .map_err(|err| AtomaServiceError::InternalError {
@@ -817,20 +817,22 @@ pub async fn verify_permissions(
         .ok_or_else(|| AtomaServiceError::InvalidBody {
             message: "Model is not a string".to_string(),
             endpoint: endpoint.clone(),
-        })?;
-    utils::check_if_too_many_requests(&state, model, &endpoint).await?;
-    if !state.models.contains(&model.to_string()) {
+        })?
+        .to_lowercase();
+    if !state.models.contains(&model) {
         return Err(AtomaServiceError::InvalidBody {
             message: format!("Model not supported, supported models: {:?}", state.models),
             endpoint: endpoint.clone(),
         });
     }
 
+    utils::check_if_too_many_requests(&state, &model, &endpoint).await?;
+
     let TokensEstimate {
         num_input_tokens,
         max_output_tokens,
         max_total_tokens,
-    } = utils::calculate_tokens(&body_json, request_type, &state, model, &endpoint)?;
+    } = utils::calculate_tokens(&body_json, request_type, &state, &model, &endpoint)?;
 
     let max_total_tokens = max_total_tokens as i64;
     let max_output_tokens = max_output_tokens as i64;
@@ -865,7 +867,7 @@ pub async fn verify_permissions(
             req_parts,
             request_type,
             body_bytes,
-            model,
+            &model,
             instant,
         )
         .await?
@@ -1658,7 +1660,7 @@ pub mod utils {
         model: &str,
         endpoint: &str,
     ) -> Result<(), AtomaServiceError> {
-        match state.too_many_requests.entry(model.to_string()) {
+        match state.too_many_requests.entry(model.to_owned()) {
             dashmap::mapref::entry::Entry::Occupied(occupied_entry) => {
                 let elapsed_ms = occupied_entry.get().elapsed().as_millis();
 
@@ -1686,7 +1688,7 @@ pub mod utils {
         }
         let chat_completions_service_urls = state
                 .chat_completions_service_urls
-                .get(&model.to_lowercase())
+                .get(model)
                 .ok_or_else(|| {
                     AtomaServiceError::InvalidBody {
                         message: format!(
@@ -1706,19 +1708,18 @@ pub mod utils {
             .iter()
             .any(|metric| metric.under_lower_threshold(state.memory_lower_threshold))
         {
-            state.too_many_requests.remove(model);
             tracing::debug!(
                     target = "atoma-service",
                     level = "debug",
-                    "Model {} is in the `too_many_requests` map, but metrics indicate that it is no longer exceeding the lower threshold. Removing from the map.",
+                    "Model {} is not in the `too_many_requests` map, but metrics indicate that it is no longer exceeding the lower threshold. Removing from the map.",
                     model
                 );
         } else if !metrics.is_empty() {
-            // TODO: Should we add the model to the `too_many_requests` map here?
+            // TODO: Should we add the model to the `too_many_requests` map here? It means that the service is either dead, or we are restarting it.
             tracing::debug!(
                     target = "atoma-service",
                     level = "debug",
-                    "Model {} is in the `too_many_requests` map, but metrics indicate that it is still exceeding the lower threshold. Processing can continue.",
+                    "Model {} is not in the `too_many_requests` map, but metrics indicate that it is still exceeding the lower threshold. Processing can continue.",
                     model
                 );
             return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
diff --git a/atoma-service/src/tests.rs b/atoma-service/src/tests.rs
index 05938580..b94ad8e2 100644
--- a/atoma-service/src/tests.rs
+++ b/atoma-service/src/tests.rs
@@ -344,12 +344,7 @@ mod middleware {
             AppState {
                 concurrent_requests_per_stack: Arc::new(DashMap::new()),
                 client_dropped_streamer_connections: Arc::new(DashSet::new()),
-                models: Arc::new(
-                    models
-                        .into_iter()
-                        .map(std::string::ToString::to_string)
-                        .collect(),
-                ),
+                models: Arc::new(models.into_iter().map(str::to_lowercase).collect()),
                 tokenizers: Arc::new(vec![Arc::new(tokenizer.clone()), Arc::new(tokenizer)]),
                 state_manager_sender,
                 decryption_sender,