Skip to content

Commit 6c9e60d

Browse files
peterschmidt85 authored and haydnli-shopify committed
[UX] Show status message as retrying in case a run or job is being retried (dstackai#2758)
1 parent 3fb80c8 commit 6c9e60d

9 files changed

Lines changed: 70 additions & 14 deletions

File tree

frontend/src/libs/run.ts

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,15 @@ import { get as _get } from 'lodash';
22
import { StatusIndicatorProps } from '@cloudscape-design/components';
33

44
import { capitalize } from 'libs';
5+
import { finishedRunStatuses } from '../pages/Runs/constants';
56

67
import { IModelExtended } from '../pages/Models/List/types';
78

89
export const getStatusIconType = (
910
status: IRun['status'] | TJobStatus,
1011
terminationReason: string | null | undefined,
1112
): StatusIndicatorProps['type'] => {
12-
if (terminationReason === 'interrupted_by_no_capacity') {
13+
if (finishedRunStatuses.includes(status) && terminationReason === 'interrupted_by_no_capacity') {
1314
return 'stopped';
1415
}
1516
switch (status) {
@@ -41,24 +42,26 @@ export const getStatusIconColor = (
4142
if (terminationReason === 'failed_to_start_due_to_no_capacity' || terminationReason === 'interrupted_by_no_capacity') {
4243
return 'yellow';
4344
}
44-
4545
switch (status) {
46+
case 'submitted':
47+
case 'pending':
48+
return 'blue';
4649
case 'pulling':
4750
return 'green';
4851
case 'aborted':
4952
return 'yellow';
5053
case 'done':
51-
return 'blue';
54+
return 'grey';
5255
default:
5356
return undefined;
5457
}
5558
};
5659

5760
export const getRunStatusMessage = (run: IRun): string => {
58-
if (run.latest_job_submission?.status_message) {
61+
if (finishedRunStatuses.includes(run.status) && run.latest_job_submission?.status_message) {
5962
return capitalize(run.latest_job_submission.status_message);
6063
} else {
61-
return capitalize(run.status);
64+
return capitalize(run.status_message || run.status);
6265
}
6366
};
6467

frontend/src/pages/Runs/Details/RunDetails/index.tsx

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import { Logs } from '../Logs';
2424
import { getJobSubmissionId } from '../Logs/helpers';
2525

2626
import styles from './styles.module.scss';
27+
import { finishedRunStatuses } from 'pages/Runs/constants';
2728

2829
export const RunDetails = () => {
2930
const { t } = useTranslation();
@@ -47,8 +48,8 @@ export const RunDetails = () => {
4748

4849
if (!runData) return null;
4950

50-
const status = runData.latest_job_submission?.status ?? runData.status;
51-
const terminationReason = runData.latest_job_submission?.termination_reason;
51+
const status = finishedRunStatuses.includes(runData.status) ? runData.latest_job_submission?.status ?? runData.status : runData.status;
52+
const terminationReason = finishedRunStatuses.includes(runData.status) ? runData.latest_job_submission?.termination_reason : null;
5253

5354
return (
5455
<>

frontend/src/pages/Runs/List/hooks/useColumnsDefinitions.tsx

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import {
1616
getRunListItemResources,
1717
getRunListItemSpotLabelKey,
1818
} from '../helpers';
19+
import { finishedRunStatuses } from 'pages/Runs/constants';
1920

2021
export const useColumnsDefinitions = () => {
2122
const { t } = useTranslation();
@@ -65,8 +66,8 @@ export const useColumnsDefinitions = () => {
6566
id: 'status',
6667
header: t('projects.run.status'),
6768
cell: (item: IRun) => {
68-
const status = item.latest_job_submission?.status ?? item.status;
69-
const terminationReason = item.latest_job_submission?.termination_reason;
69+
const status = finishedRunStatuses.includes(item.status) ? item.latest_job_submission?.status ?? item.status : item.status;
70+
const terminationReason = finishedRunStatuses.includes(item.status) ? item.latest_job_submission?.termination_reason : null;
7071

7172
return (
7273
<StatusIndicator

frontend/src/pages/Runs/constants.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,5 @@ export const runStatusForStopping: TJobStatus[] = ['submitted', 'provisioning',
44
export const runStatusForAborting: TJobStatus[] = ['submitted', 'provisioning', 'pulling', 'pending', 'running'];
55
export const unfinishedRuns: TJobStatus[] = ['running', 'terminating', 'pending'];
66
export const finishedJobs: TJobStatus[] = ['terminated', 'aborted', 'failed', 'done'];
7+
// TODO: Replace TJobStatus with TRunStatus and remove all consts above
8+
export const finishedRunStatuses: TJobStatus[] = ['done', 'failed', 'terminated'];

frontend/src/types/run.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ declare interface IRun {
173173
latest_job_submission?: IJobSubmission;
174174
cost: number;
175175
service: IRunService | null;
176+
status_message?: string | null;
176177
}
177178

178179
declare interface IMetricsItem {

src/dstack/_internal/cli/utils/run.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -166,15 +166,15 @@ def get_runs_table(
166166
run_row: Dict[Union[str, int], Any] = {
167167
"NAME": run.run_spec.run_name,
168168
"SUBMITTED": format_date(run.submitted_at),
169+
"STATUS": (
170+
run.latest_job_submission.status_message
171+
if run.status.is_finished() and run.latest_job_submission
172+
else run.status_message
173+
),
169174
}
170175
if run.error:
171176
run_row["ERROR"] = run.error
172177
if len(run.jobs) != 1:
173-
run_row["STATUS"] = (
174-
run.latest_job_submission.status_message
175-
if run.latest_job_submission
176-
else run.status
177-
)
178178
add_row_from_dict(table, run_row)
179179

180180
for job in run.jobs:

src/dstack/_internal/core/models/runs.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,7 @@ class Run(CoreModel):
496496
submitted_at: datetime
497497
last_processed_at: datetime
498498
status: RunStatus
499+
status_message: Optional[str] = None
499500
termination_reason: Optional[RunTerminationReason]
500501
run_spec: RunSpec
501502
jobs: List[Job]
@@ -524,6 +525,49 @@ def _get_error(termination_reason: Optional[RunTerminationReason]) -> Optional[s
524525
else:
525526
return None
526527

528+
@root_validator
529+
def _status_message(cls, values) -> Dict:
530+
try:
531+
status = values["status"]
532+
run_spec: RunSpec = values["run_spec"]
533+
retry_on_events = (
534+
run_spec.configuration.retry.on_events
535+
if run_spec and run_spec.configuration.retry
536+
else []
537+
)
538+
jobs = values["jobs"]
539+
termination_reason = Run.get_last_termination_reason(jobs[0]) if jobs else None
540+
except KeyError:
541+
return values
542+
values["status_message"] = Run._get_status_message(
543+
status=status,
544+
retry_on_events=retry_on_events,
545+
termination_reason=termination_reason,
546+
)
547+
return values
548+
549+
@staticmethod
550+
def get_last_termination_reason(job: "Job") -> Optional[JobTerminationReason]:
551+
for submission in reversed(job.job_submissions):
552+
if submission.termination_reason is not None:
553+
return submission.termination_reason
554+
return None
555+
556+
@staticmethod
557+
def _get_status_message(
558+
status: RunStatus,
559+
retry_on_events: List[RetryEvent],
560+
termination_reason: Optional[JobTerminationReason],
561+
) -> str:
562+
# Currently, `retrying` is shown only for `no-capacity` events
563+
if (
564+
status in [RunStatus.SUBMITTED, RunStatus.PENDING]
565+
and termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
566+
and RetryEvent.NO_CAPACITY in retry_on_events
567+
):
568+
return "retrying"
569+
return status.value
570+
527571

528572
class JobPlan(CoreModel):
529573
job_spec: JobSpec

src/dstack/api/server/_runs.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ def _get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
100100
current_resource = plan.current_resource
101101
if current_resource is not None:
102102
current_resource_excludes = {}
103+
current_resource_excludes["status_message"] = True
103104
apply_plan_excludes["current_resource"] = current_resource_excludes
104105
current_resource_excludes["run_spec"] = _get_run_spec_excludes(current_resource.run_spec)
105106
job_submissions_excludes = {}

src/tests/_internal/server/routers/test_runs.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@ def get_dev_env_run_dict(
247247
"submitted_at": submitted_at,
248248
"last_processed_at": last_processed_at,
249249
"status": "submitted",
250+
"status_message": "submitted",
250251
"run_spec": {
251252
"configuration": {
252253
"entrypoint": None,
@@ -510,6 +511,7 @@ async def test_lists_runs(self, test_db, session: AsyncSession, client: AsyncCli
510511
"submitted_at": run1_submitted_at.isoformat(),
511512
"last_processed_at": run1_submitted_at.isoformat(),
512513
"status": "submitted",
514+
"status_message": "submitted",
513515
"run_spec": run1_spec.dict(),
514516
"jobs": [
515517
{
@@ -563,6 +565,7 @@ async def test_lists_runs(self, test_db, session: AsyncSession, client: AsyncCli
563565
"submitted_at": run2_submitted_at.isoformat(),
564566
"last_processed_at": run2_submitted_at.isoformat(),
565567
"status": "submitted",
568+
"status_message": "submitted",
566569
"run_spec": run2_spec.dict(),
567570
"jobs": [],
568571
"latest_job_submission": None,

0 commit comments

Comments
 (0)