3232from utils .data import make_url_sassy
3333from utils .email import codalab_send_markdown_email
3434
35+ from channels .layers import get_channel_layer
36+ from asgiref .sync import async_to_sync
37+
3538import logging
3639logger = logging .getLogger (__name__ )
3740
@@ -784,9 +787,66 @@ def submission_status_cleanup():
784787 submissions = Submission .objects .filter (status = Submission .RUNNING , has_children = False ).select_related ('phase' , 'parent' )
785788
786789 for sub in submissions :
787- # Check if the submission has been running for 24 hours longer than execution_time_limit
788790 if sub .started_when < now () - timedelta (milliseconds = (3600000 * 24 ) + sub .phase .execution_time_limit ):
789791 if sub .parent is not None :
790792 sub .parent .cancel (status = Submission .FAILED )
791793 else :
792794 sub .cancel (status = Submission .FAILED )
795+
796+
def _broadcast_worker_state(payload):
    """Publish a worker health event to the "compute_workers" channel group.

    The event has type "worker.health" and carries ``payload`` unchanged under
    the "worker" key. Nothing is sent when ``get_channel_layer()`` returns a
    falsy value.

    Args:
        payload: Worker state to forward to the group's consumers.
            NOTE(review): presumably must be serializable by the configured
            channel layer -- confirm against the layer backend.
    """
    layer = get_channel_layer()
    if layer:
        # group_send is a coroutine function; wrap it so it can be called
        # from synchronous code.
        send_to_group = async_to_sync(layer.group_send)
        event = {
            "type": "worker.health",
            "worker": payload,
        }
        send_to_group("compute_workers", event)
809+
810+
@app.task(queue="site-worker", soft_time_limit=60)
def refresh_compute_worker_health():
    """Snapshot the state of compute workers and publish it.

    Asks Celery (inspect timeout of 1) for worker stats plus active and
    reserved tasks. For each worker reported in the stats whose name starts
    with "compute-worker", this:

    * builds a payload with hostname, status ("busy" when it has any active
      or reserved task, otherwise "available"), the task count, and the
      current timestamp;
    * stores the payload as JSON under ``worker:<name>:heartbeat`` with
      ``ex=35``;
    * records hostname and last-seen timestamp in the ``WORKERS_REGISTRY_KEY``
      hash;
    * broadcasts the payload via ``_broadcast_worker_state``.

    Returns early, after logging, if the inspector is None or any of the
    inspect calls raises.

    NOTE(review): ``r`` is presumably a Redis client defined elsewhere in this
    module, which would make ``ex=35`` a 35-second expiry -- confirm.
    """
    inspector = app_or_default().control.inspect(timeout=1)
    if inspector is None:
        logger.warning("Celery inspect returned None")
        return

    try:
        # A falsy reply is treated as "no data".
        stats = inspector.stats() or {}
        active = inspector.active() or {}
        reserved = inspector.reserved() or {}
    except Exception:
        logger.exception("Unable to inspect Celery workers")
        return

    # Only workers present in the stats reply are considered.
    compute_workers = (name for name in stats if name.startswith("compute-worker"))

    for hostname in compute_workers:
        active_tasks = active.get(hostname, [])
        reserved_tasks = reserved.get(hostname, [])
        job_count = len(active_tasks) + len(reserved_tasks)

        if job_count > 0:
            state = "busy"
        else:
            state = "available"

        snapshot = {
            "hostname": hostname,
            "status": state,
            "running_jobs": job_count,
            "timestamp": now().timestamp(),
        }
        registry_entry = {
            "hostname": hostname,
            "last_seen": snapshot["timestamp"],
        }

        # Expiring per-worker heartbeat, then the registry entry.
        r.set(f"worker:{hostname}:heartbeat", json.dumps(snapshot), ex=35)
        r.hset(WORKERS_REGISTRY_KEY, hostname, json.dumps(registry_entry))

        _broadcast_worker_state(snapshot)
0 commit comments