@@ -101,6 +101,14 @@ def to_status(self) -> "RunStatus":
101101 }
102102 return mapping [self ]
103103
def to_error(self) -> Optional[str]:
    """Return a short error label for this run termination reason.

    Only ``RETRY_LIMIT_EXCEEDED`` and ``SERVER_ERROR`` carry a label;
    every other reason yields ``None``.
    """
    error_labels = {
        RunTerminationReason.RETRY_LIMIT_EXCEEDED: "retry limit exceeded",
        RunTerminationReason.SERVER_ERROR: "server error",
    }
    return error_labels.get(self)
111+
104112
105113class JobTerminationReason (str , Enum ):
106114 # Set by the server
@@ -162,6 +170,24 @@ def to_retry_event(self) -> Optional[RetryEvent]:
162170 default = RetryEvent .ERROR if self .to_status () == JobStatus .FAILED else None
163171 return mapping .get (self , default )
164172
def to_error(self) -> Optional[str]:
    """Return a short error label for this job termination reason.

    Reasons that are absent from the table yield ``None``. The intent is
    that values already handled and shown in ``status_message`` are left
    out of the table so they are not reported twice.
    """
    reason = JobTerminationReason  # local alias to keep the table compact
    labels = {
        reason.INSTANCE_UNREACHABLE: "instance unreachable",
        reason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
        reason.VOLUME_ERROR: "volume error",
        reason.GATEWAY_ERROR: "gateway error",
        reason.SCALED_DOWN: "scaled down",
        reason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
        reason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
        reason.PORTS_BINDING_FAILED: "ports binding failed",
        reason.CREATING_CONTAINER_ERROR: "runner error",
        reason.EXECUTOR_ERROR: "executor error",
        reason.MAX_DURATION_EXCEEDED: "max duration exceeded",
    }
    if self in labels:
        return labels[self]
    return None
190+
165191
166192class Requirements (CoreModel ):
167193 # TODO: Make requirements' fields required
@@ -305,13 +331,12 @@ class JobSubmission(CoreModel):
305331 finished_at : Optional [datetime ]
306332 inactivity_secs : Optional [int ]
307333 status : JobStatus
334+ status_message : str = "" # default for backward compatibility
308335 termination_reason : Optional [JobTerminationReason ]
309336 termination_reason_message : Optional [str ]
310337 exit_status : Optional [int ]
311338 job_provisioning_data : Optional [JobProvisioningData ]
312339 job_runtime_data : Optional [JobRuntimeData ]
313- # TODO: make status_message and error a computed field after migrating to pydanticV2
314- status_message : Optional [str ] = None
315340 error : Optional [str ] = None
316341
317342 @property
@@ -325,71 +350,11 @@ def duration(self) -> timedelta:
325350 end_time = self .finished_at
326351 return end_time - self .submitted_at
327352
def dict(self, *args, **kwargs) -> Dict:
    """Serialize the submission, overwriting the two derived fields.

    ``status_message`` and ``error`` are recomputed from the current state
    and written over whatever the base serialization produced for them.
    Positional and keyword arguments are forwarded to ``CoreModel.dict``.
    """
    # Compute the derived values before serializing, as the base call follows.
    derived = {
        "status_message": self._get_status_message(),
        "error": self._get_error(),
    }
    # super() does not work with pydantic-duality
    payload = CoreModel.dict(self, *args, **kwargs)
    payload.update(derived)
    return payload
336-
def _get_status_message(self) -> Optional[str]:
    """Build the human-readable status message for this submission.

    Returns:
        - ``"exited (0)"`` when the status is ``DONE``;
        - for ``FAILED``: ``"exited (<exit_status>)"`` when the container
          exited with an error, ``"no offers"`` when it failed to start due
          to no capacity, ``"interrupted"`` when it was interrupted by no
          capacity, and ``"error"`` for any other reason;
        - for ``TERMINATED``: ``"stopped"`` or ``"aborted"`` when the user
          terminated or aborted the job;
        - otherwise the raw ``self.status.value``.
    """
    if self.status == JobStatus.DONE:
        return "exited (0)"

    reason = self.termination_reason
    if self.status == JobStatus.FAILED:
        if reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
            # Same "exited (<code>)" shape as the success message above,
            # with no stray space before the closing parenthesis.
            return f"exited ({self.exit_status})"
        if reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
            return "no offers"
        if reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
            return "interrupted"
        return "error"

    if self.status == JobStatus.TERMINATED:
        if reason == JobTerminationReason.TERMINATED_BY_USER:
            return "stopped"
        if reason == JobTerminationReason.ABORTED_BY_USER:
            return "aborted"

    # Any other status (or a TERMINATED job with another reason) is shown as-is.
    return self.status.value
357-
def _get_error(self) -> Optional[str]:
    """Return the error label for this submission's termination reason, if any."""
    reason = self.termination_reason
    return JobSubmission._termination_reason_to_error(termination_reason=reason)
362-
@staticmethod
def _termination_reason_to_error(
    termination_reason: Optional[JobTerminationReason],
) -> Optional[str]:
    """Map a job termination reason to a short error label.

    Returns ``None`` when ``termination_reason`` is ``None`` or is not one
    of the reasons listed in the table.
    """
    reason = JobTerminationReason  # local alias to keep the table compact
    labels = {
        reason.INSTANCE_UNREACHABLE: "instance unreachable",
        reason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
        reason.VOLUME_ERROR: "volume error",
        reason.GATEWAY_ERROR: "gateway error",
        reason.SCALED_DOWN: "scaled down",
        reason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
        reason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
        reason.PORTS_BINDING_FAILED: "ports binding failed",
        reason.CREATING_CONTAINER_ERROR: "runner error",
        reason.EXECUTOR_ERROR: "executor error",
        reason.MAX_DURATION_EXCEEDED: "max duration exceeded",
    }
    if termination_reason in labels:
        return labels[termination_reason]
    return None
381-
382353
383354class Job (CoreModel ):
384355 job_spec : JobSpec
385356 job_submissions : List [JobSubmission ]
386357
def get_last_termination_reason(self) -> Optional[JobTerminationReason]:
    """Return the termination reason of the latest submission that has one.

    Submissions are scanned from the end of ``job_submissions`` backwards;
    ``None`` is returned when no submission carries a termination reason.
    """
    reasons_newest_first = (
        submission.termination_reason for submission in reversed(self.job_submissions)
    )
    return next((reason for reason in reasons_newest_first if reason is not None), None)
392-
393358
394359class RunSpec (CoreModel ):
395360 # TODO: run_name, working_dir are redundant here since they are already passed in configuration
@@ -519,72 +484,17 @@ class Run(CoreModel):
519484 submitted_at : datetime
520485 last_processed_at : datetime
521486 status : RunStatus
522- status_message : Optional [ str ] = None
487+ status_message : str = "" # default for backward compatibility
523488 termination_reason : Optional [RunTerminationReason ] = None
524489 run_spec : RunSpec
525490 jobs : List [Job ]
526491 latest_job_submission : Optional [JobSubmission ] = None
527492 cost : float = 0
528493 service : Optional [ServiceSpec ] = None
529494 deployment_num : int = 0 # default for compatibility with pre-0.19.14 servers
530- # TODO: make error a computed field after migrating to pydanticV2
531495 error : Optional [str ] = None
532496 deleted : Optional [bool ] = None
533497
def dict(self, *args, **kwargs) -> Dict:
    """Serialize the run, overwriting the two derived fields.

    ``status_message`` and ``error`` are recomputed from the current state
    and written over whatever the base serialization produced for them.
    Positional and keyword arguments are forwarded to ``CoreModel.dict``.
    """
    # Compute the derived values before serializing, as the base call follows.
    derived = {
        "status_message": self._get_status_message(),
        "error": self._get_error(),
    }
    # super() does not work with pydantic-duality
    payload = CoreModel.dict(self, *args, **kwargs)
    payload.update(derived)
    return payload
542-
def _get_error(self) -> Optional[str]:
    """Return the error label for this run's termination reason, if any."""
    reason = self.termination_reason
    return Run._termination_reason_to_error(termination_reason=reason)
545-
@staticmethod
def _termination_reason_to_error(
    termination_reason: Optional[RunTerminationReason],
) -> Optional[str]:
    """Map a run termination reason to a short error label.

    Only ``RETRY_LIMIT_EXCEEDED`` and ``SERVER_ERROR`` carry a label; any
    other reason, including ``None``, yields ``None``.
    """
    error_labels = {
        RunTerminationReason.RETRY_LIMIT_EXCEEDED: "retry limit exceeded",
        RunTerminationReason.SERVER_ERROR: "server error",
    }
    return error_labels.get(termination_reason)
556-
def _get_status_message(self) -> Optional[str]:
    """Build the human-readable status message for this run.

    Returns ``"pulling"`` when the run has exactly one job whose latest
    submission is in the ``PULLING`` status, ``"retrying"`` when the run is
    ``SUBMITTED``/``PENDING`` after a no-capacity start failure and the job's
    retry policy covers ``NO_CAPACITY``, and the raw ``self.status.value``
    in every other case (including a run with no jobs).
    """
    if not self.jobs:
        return self.status.value

    # Note: the job inspected here is the first entry of ``self.jobs``.
    job = self.jobs[0]
    # FIXME: status_message should not require all job submissions for status calculation
    # since it's very expensive and is not required for anything else.
    # May return a different status if not all job submissions requested.
    # TODO: Calculate status_message by looking at job models directly instead job submissions.
    termination_reason = job.get_last_termination_reason()

    # FIXME: Clarify why show "pulling" only in case of one job
    submissions = job.job_submissions
    if len(self.jobs) == 1 and submissions and submissions[-1].status == JobStatus.PULLING:
        return "pulling"

    retry_policy = job.job_spec.retry
    retry_events = retry_policy.on_events if retry_policy else []
    # Currently, `retrying` is shown only for `no-capacity` events
    if (
        self.status in [RunStatus.SUBMITTED, RunStatus.PENDING]
        and termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
        and RetryEvent.NO_CAPACITY in retry_events
    ):
        return "retrying"

    return self.status.value
587-
588498 def is_deployment_in_progress (self ) -> bool :
589499 return any (
590500 not j .job_submissions [- 1 ].status .is_finished ()
0 commit comments