AtomaAI
diff --git a/‎atoma-bin/atoma_node.rs‎
Lines changed: 3 additions & 1 deletion b/‎atoma-bin/atoma_node.rs‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎atoma-service/src/config.rs‎
Lines changed: 5 additions & 0 deletions b/‎atoma-service/src/config.rs‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎atoma-service/src/handlers/chat_completions.rs‎
Lines changed: 4 additions & 6 deletions b/‎atoma-service/src/handlers/chat_completions.rs‎
Lines changed: 4 additions & 6 deletions
diff --git a/‎atoma-service/src/handlers/completions.rs‎
Lines changed: 4 additions & 6 deletions b/‎atoma-service/src/handlers/completions.rs‎
Lines changed: 4 additions & 6 deletions
@@ -373,9 +373,11 @@ async fn main() -> Result<()> {
         keystore: Arc::new(keystore),
         address_index,
         whitelist_sui_addresses_for_fiat: config.service.whitelist_sui_addresses_for_fiat,
-        too_many_requests: Arc::new(DashMap::new()),
+        too_many_requests: Arc::new(DashSet::new()),
         too_many_requests_timeout_ms: u128::from(config.service.too_many_requests_timeout_ms),
         running_num_requests: Arc::new(RequestCounter::new()),
+        memory_lower_threshold: config.service.memory_lower_threshold,
+        memory_upper_threshold: config.service.memory_upper_threshold,
     };
 
     let chat_completions_service_urls = app_state
 
@@ -60,6 +60,11 @@ pub struct AtomaServiceConfig {
 
     /// The timeout for the too many requests error in milliseconds.
     pub too_many_requests_timeout_ms: u64,
+
+    /// Lower threshold for memory usage; if the memory usage goes below this value, the service will not be considered overloaded.
+    pub memory_lower_threshold: f64,
+    /// Upper threshold for memory usage; if the memory usage goes above this value, the service will be considered overloaded.
+    pub memory_upper_threshold: f64,
 }
 
 impl AtomaServiceConfig {
 
@@ -908,16 +908,15 @@ async fn handle_streaming_response(
             &state.running_num_requests,
             chat_completions_service_urls,
             &model.to_lowercase(),
+            state.memory_upper_threshold,
         )
         .await
         .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
             message: e.to_string(),
             endpoint: endpoint.clone(),
         })?;
     if status_code == StatusCode::TOO_MANY_REQUESTS {
-        state
-            .too_many_requests
-            .insert(model.to_string(), Instant::now());
+        state.too_many_requests.insert(model.to_string());
         return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
             message: "Too many requests".to_string(),
             endpoint: endpoint.clone(),
@@ -1341,16 +1340,15 @@ pub mod utils {
                 &state.running_num_requests,
                 chat_completions_service_url_services,
                 model,
+                state.memory_upper_threshold,
             )
             .await
             .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
                 message: e.to_string(),
                 endpoint: endpoint.to_string(),
             })?;
         if status_code == StatusCode::TOO_MANY_REQUESTS {
-            state
-                .too_many_requests
-                .insert(model.to_string(), Instant::now());
+            state.too_many_requests.insert(model.to_string());
             return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
                 message: "Too many requests".to_string(),
                 endpoint: endpoint.to_string(),
 
@@ -881,16 +881,15 @@ async fn handle_streaming_response(
         &state.running_num_requests,
         chat_completions_service_urls,
         model,
+        state.memory_upper_threshold,
     )
     .await
     .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
         message: e.to_string(),
         endpoint: endpoint.clone(),
     })?;
     if status_code == StatusCode::TOO_MANY_REQUESTS {
-        state
-            .too_many_requests
-            .insert(model.to_string(), Instant::now());
+        state.too_many_requests.insert(model.to_string());
         return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
             message: "Too many requests".to_string(),
             endpoint: endpoint.clone(),
@@ -1303,16 +1302,15 @@ pub mod utils {
                 &state.running_num_requests,
                 completions_service_url_services,
                 model,
+                state.memory_upper_threshold,
             )
             .await
             .map_err(|e| AtomaServiceError::ChatCompletionsServiceUnavailable {
                 message: e.to_string(),
                 endpoint: endpoint.to_string(),
             })?;
         if status_code == StatusCode::TOO_MANY_REQUESTS {
-            state
-                .too_many_requests
-                .insert(model.to_string(), Instant::now());
+            state.too_many_requests.insert(model.to_string());
             return Err(AtomaServiceError::ChatCompletionsServiceUnavailable {
                 message: "Too many requests".to_string(),
                 endpoint: endpoint.to_string(),