2727from renku_data_services .base_models .metrics import MetricsEvent
2828from renku_data_services .data_tasks .dependencies import DependencyManager
2929from renku_data_services .data_tasks .taskman import TaskDefininions
30+ from renku_data_services .k8s .models import K8sObject , K8sObjectFilter
3031from renku_data_services .namespace .models import NamespaceKind
32+ from renku_data_services .notebooks .constants import AMALTHEA_SESSION_GVK
33+ from renku_data_services .notifications .models import UnsavedAlert
3134from renku_data_services .solr .solr_client import DefaultSolrClient
35+ from renku_data_services .utils .core import get_nonzero_minimum
3236
3337logger = logging .getLogger (__name__ )
3438
@@ -122,8 +126,8 @@ async def sync_user_namespaces(dm: DependencyManager) -> None:
122126 await dm .authz .client .WriteRelationships (authz_change .apply )
123127 num_authz += 1
124128 except Exception as err :
125- # NOTE: We do not rollback the authz changes here because it is OK if something is in Authz DB
126- # but not in the message queue but not vice- versa.
129+ # NOTE: We do not roll back the authz changes here because it is OK if something is in Authz DB
130+ # but not in the message queue but not vice versa.
127131 logger .error (f"Failed to sync user namespace { user_namespace } because { err } " )
128132 await tx .rollback ()
129133 else :
@@ -442,6 +446,121 @@ async def record_resource_requests(dm: DependencyManager) -> None:
442446 await asyncio .sleep (interval_seconds )
443447
444448
def _extract_session_quota_metadata(session: K8sObject) -> tuple[str, str, int, int | None] | None:
    """Extract session name, user id, resource pool id and resource class id from an AmaltheaSession object.

    Args:
        session: The Kubernetes object to inspect; its ``manifest`` and ``name`` are read.

    Returns:
        A ``(session_name, user_id, resource_pool_id, resource_class_id)`` tuple, or ``None`` when the
        session should be skipped because:

        * its ``status.state`` is not ``"running"`` (compared case-insensitively),
        * the ``renku.io/safe-username`` label is missing or empty, or
        * the ``renku.io/resource_pool_id`` annotation is missing or empty.

        ``resource_class_id`` is ``None`` when the ``renku.io/resource_class_id`` annotation is missing
        or empty.

    Raises:
        ValueError: If a resource pool or resource class annotation is present but is not an integer.
    """
    manifest = session.manifest

    # NOTE: A section can be present in the manifest with an explicit null value, in which case the
    # default passed to `.get` is not used. Fall back to an empty mapping so such sessions are skipped
    # instead of failing with an AttributeError.
    status = manifest.get("status") or {}
    if str(status.get("state", "")).lower() != "running":
        return None

    metadata = manifest.get("metadata") or {}

    labels = metadata.get("labels") or {}
    user_id = labels.get("renku.io/safe-username")
    if not user_id:
        return None

    annotations = metadata.get("annotations") or {}

    # NOTE: An empty annotation is treated the same as a missing one, consistently for both ids.
    resource_pool_id_raw = annotations.get("renku.io/resource_pool_id")
    if resource_pool_id_raw is None or resource_pool_id_raw == "":
        return None
    resource_pool_id = int(resource_pool_id_raw)

    resource_class_id_raw = annotations.get("renku.io/resource_class_id")
    resource_class_id = int(resource_class_id_raw) if resource_class_id_raw else None

    return session.name, user_id, resource_pool_id, resource_class_id
472+
473+
async def _check_session_quota_and_send_alerts(dm: DependencyManager) -> None:
    """Check all active sessions and send alerts if the remaining user quota is below threshold.

    For every AmaltheaSession listed by the k8s client, the usage returned by
    ``dm.resource_usage_service.get_running_week`` is compared against the non-zero minimum of the
    pool's user and total limits. Sessions are skipped when they have no quota metadata, no usage
    data, no positive quota, or a usage percentage at or below
    ``100 - dm.config.session_quota_alert_remaining_threshold_p``.

    For the remaining sessions exactly one alert is created or updated:

    * ``session_quota_exhausted`` when the usage has reached the quota,
    * ``session_quota_critically_low`` when the resource class cost is known and the estimated
      remaining time is below ``dm.config.session_quota_alert_critical_m`` minutes,
    * ``session_quota_low`` otherwise.

    A failure while handling one session is logged and does not stop the processing of the others.

    Args:
        dm: The dependency manager providing the k8s client, the usage service, the repositories and
            the configuration.
    """
    admin_user = InternalServiceAdmin(id=ServiceAdminId.capacity_reservation)
    session_filter = K8sObjectFilter(gvk=AMALTHEA_SESSION_GVK)

    async for session in dm.k8s_client.list(session_filter):
        try:
            metadata = _extract_session_quota_metadata(session)
            if not metadata:
                continue

            session_name, user_id, resource_pool_id, resource_class_id = metadata

            usage = await dm.resource_usage_service.get_running_week(
                resource_pool_id=resource_pool_id, user_id=user_id
            )
            if not usage:
                continue

            total_quota = get_nonzero_minimum(usage.pool_limits.user_limit.value, usage.pool_limits.total_limit.value)
            if total_quota <= 0:
                continue

            used_credits = usage.user_usage.cost.value
            usage_p = (used_credits / total_quota) * 100
            usage_threshold_p = 100 - dm.config.session_quota_alert_remaining_threshold_p
            if usage_p <= usage_threshold_p:
                continue

            # NOTE: Exhaustion only depends on the usage and the quota, so it is reported even when the
            # resource class of the session is unknown. Otherwise such a session would get a
            # "running low" alert although the quota is already used up.
            if used_credits >= total_quota:
                hibernation_alert = UnsavedAlert(
                    user_id=user_id,
                    event_type="session_quota_exhausted",
                    session_name=session_name,
                    title="Session paused due to quota exhaustion",
                    message=(
                        f"Your session {session_name} in resource pool {resource_pool_id} has exhausted its quota"
                    ),
                )
                await dm.notifications_repo.create_or_update_alert(user=admin_user, alert=hibernation_alert)
                logger.info(
                    f"Session {session_name} for user {user_id} has exhausted its quota in resource pool "
                    f"{resource_pool_id}"
                )
                continue

            # NOTE: Without the resource_class_id, we cannot calculate the remaining time
            if resource_class_id:
                resource_class_cost = await dm.resource_requests_repo.find_resource_class_costs(
                    resource_pool_id, resource_class_id
                )

                if resource_class_cost and resource_class_cost.cost.value > 0:
                    remaining_credits = total_quota - used_credits
                    # NOTE(review): the factor 60 presumes the resource class cost is expressed per
                    # hour - confirm against the resource requests repository.
                    remaining_minutes = (remaining_credits / resource_class_cost.cost.value) * 60
                    if remaining_minutes < dm.config.session_quota_alert_critical_m:
                        critical_alert = UnsavedAlert(
                            user_id=user_id,
                            event_type="session_quota_critically_low",
                            session_name=session_name,
                            title="Session quota expiring soon",
                            message=(
                                f"Your session in resource pool {resource_pool_id} will run out of quota in "
                                f"approximately {remaining_minutes:.0f} minutes."
                            ),
                        )
                        await dm.notifications_repo.create_or_update_alert(user=admin_user, alert=critical_alert)
                        logger.info(f"Created critical quota alert for user {user_id}, session {session_name}")
                        continue

            alert = UnsavedAlert(
                user_id=user_id,
                event_type="session_quota_low",
                session_name=session_name,
                title="Session quota running low",
                message=f"You have used {usage_p:.1f}% of your quota in resource pool {resource_pool_id}.",
            )
            await dm.notifications_repo.create_or_update_alert(user=admin_user, alert=alert)
            logger.info(f"Created quota alert for user {user_id}, session {session_name}")
        except Exception as e:
            # NOTE: Keep going with the other sessions, but say which session failed so that the
            # warning can be acted upon.
            failed_session = getattr(session, "name", "<unknown>")
            logger.warning(f"Failed to check quota for session {failed_session}: {e}")
551+
552+
async def monitor_session_quota_and_send_alerts(dm: DependencyManager) -> None:
    """Periodically check session quotas and send alerts when the remaining quota is low.

    Runs one quota check, sleeps for ``dm.config.session_quota_alert_check_interval_s`` seconds and
    repeats until the task is cancelled or interrupted. Any other error raised by the check propagates
    to the caller.

    Args:
        dm: The dependency manager handed to the quota check; its config provides the interval.

    Raises:
        asyncio.CancelledError: When the task is cancelled, during the check or during the sleep.
        KeyboardInterrupt: When interrupted during the check or during the sleep.
    """
    while True:
        try:
            await _check_session_quota_and_send_alerts(dm)
        except (asyncio.CancelledError, KeyboardInterrupt) as e:
            logger.warning(f"Exiting: {e}")
            # NOTE: Re-raise, otherwise the loop starts the next check right away and the cancellation
            # is swallowed. A cancellation during the sleep below already propagates in the same way.
            raise
        else:
            await asyncio.sleep(dm.config.session_quota_alert_check_interval_s)
562+
563+
445564def all_tasks (dm : DependencyManager ) -> TaskDefininions :
446565 """A dict of task factories to be managed in main."""
447566 # Impl. note: We pass the entire config to the coroutines, because
@@ -467,5 +586,6 @@ def all_tasks(dm: DependencyManager) -> TaskDefininions:
467586 "monitor_capacity_reservations" : lambda : monitor_capacity_reservations (dm ),
468587 "cleanup_orphaned_capacity_reservations" : lambda : cleanup_orphaned_capacity_reservations (dm ),
469588 "record_resource_requests" : lambda : record_resource_requests (dm ),
589+ "monitor_session_quota_and_send_alerts" : lambda : monitor_session_quota_and_send_alerts (dm ),
470590 }
471591 )
0 commit comments