dstackai
diff --git a/‎docs/docs/concepts/services.md‎
Lines changed: 49 additions & 2 deletions b/‎docs/docs/concepts/services.md‎
Lines changed: 49 additions & 2 deletions
diff --git a/‎docs/docs/reference/dstack.yml/service.md‎
Lines changed: 10 additions & 0 deletions b/‎docs/docs/reference/dstack.yml/service.md‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎docs/docs/reference/environment-variables.md‎
Lines changed: 2 additions & 0 deletions b/‎docs/docs/reference/environment-variables.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/dstack/_internal/cli/utils/run.py‎
Lines changed: 32 additions & 0 deletions b/‎src/dstack/_internal/cli/utils/run.py‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎src/dstack/_internal/core/compatibility/runs.py‎
Lines changed: 8 additions & 0 deletions b/‎src/dstack/_internal/core/compatibility/runs.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/dstack/_internal/core/models/configurations.py‎
Lines changed: 90 additions & 1 deletion b/‎src/dstack/_internal/core/models/configurations.py‎
Lines changed: 90 additions & 1 deletion
diff --git a/‎src/dstack/_internal/core/models/runs.py‎
Lines changed: 15 additions & 1 deletion b/‎src/dstack/_internal/core/models/runs.py‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎src/dstack/_internal/core/services/ssh/tunnel.py‎
Lines changed: 7 additions & 0 deletions b/‎src/dstack/_internal/core/services/ssh/tunnel.py‎
Lines changed: 7 additions & 0 deletions
@@ -187,6 +187,53 @@ port: 8000
 
 </div>
 
+### Probes
+
+Configure one or more HTTP probes to periodically check the health of the service.
+
+<div editor-title="service.dstack.yml">
+
+```yaml
+type: service
+name: my-service
+port: 80
+image: my-app:latest
+probes:
+- type: http
+  url: /health
+  interval: 15s
+```
+
+</div>
+
+You can track probe statuses in `dstack ps --verbose`.
+
+<div class="termy">
+
+```shell
+$ dstack ps --verbose
+
+ NAME                            BACKEND          STATUS   PROBES  SUBMITTED
+ my-service deployment=1                          running          11 mins ago
+   replica=0 job=0 deployment=0  aws (us-west-2)  running  ✓       11 mins ago
+   replica=1 job=0 deployment=1  aws (us-west-2)  running  ×       1 min ago
+```
+
+</div>
+
+??? info "Probe statuses"
+    The following symbols are used for probe statuses:
+
+    - `×` &mdash; the last probe execution failed.
+    - `~` &mdash; the last probe execution succeeded, but the [`ready_after`](../reference/dstack.yml/service.md#ready_after) threshold is not yet reached.
+    - `✓` &mdash; the last `ready_after` probe executions succeeded.
+
+    If multiple probes are configured for the service, their statuses are displayed in the order in which the probes appear in the configuration.
+
+Probes are executed for each service replica while the replica is `running`. Probe statuses do not affect how `dstack` handles replicas, except during [rolling deployments](#rolling-deployment).
+
+See the [reference](../reference/dstack.yml/service.md#probes) for more probe configuration options.
+
 ### Path prefix { #path-prefix }
 
 If your `dstack` project doesn't have a [gateway](gateways.md), services are hosted with the
@@ -758,7 +805,7 @@ Update the run? [y/n]:
 
 </div>
 
-If approved, `dstack` gradually updates the service replicas. To update a replica, `dstack` starts a new replica, waits for it to become `running`, then terminates the old replica. This process is repeated for each replica, one at a time.
+If approved, `dstack` gradually updates the service replicas. To update a replica, `dstack` starts a new replica, waits for it to become `running` and for all of its [probes](#probes) to pass, then terminates the old replica. This process is repeated for each replica, one at a time.
 
 You can track the progress of rolling deployment in both `dstack apply` and `dstack ps`. 
 Older replicas have lower `deployment` numbers; newer ones have higher.
@@ -783,7 +830,7 @@ The rolling deployment stops when all replicas are updated or when a new deploym
 ??? info "Supported properties"
     <!-- NOTE: should be in sync with constants in server/services/runs.py -->
 
-    Rolling deployment supports changes to the following properties: `port`, `resources`, `volumes`, `docker`, `files`, `image`, `user`, `privileged`, `entrypoint`, `working_dir`, `python`, `nvcc`, `single_branch`, `env`, `shell`, `commands`, as well as changes to [repo](repos.md) or [file](#files) contents.
+    Rolling deployment supports changes to the following properties: `port`, `probes`, `resources`, `volumes`, `docker`, `files`, `image`, `user`, `privileged`, `entrypoint`, `working_dir`, `python`, `nvcc`, `single_branch`, `env`, `shell`, `commands`, as well as changes to [repo](repos.md) or [file](#files) contents.
 
     Changes to `replicas` and `scaling` can be applied without redeploying replicas.
 
 
@@ -106,6 +106,16 @@ The `service` configuration type allows running [services](../../concepts/servic
           type:
             required: true
 
+### `probes`
+
+#### `probes[n]`
+
+#SCHEMA# dstack._internal.core.models.configurations.ProbeConfig
+    overrides:
+      show_root_heading: false
+      type:
+        required: true
+
 ### `retry`
 
 #SCHEMA# dstack._internal.core.models.profiles.ProfileRetry
 
@@ -124,6 +124,8 @@ For more details on the options below, refer to the [server deployment](../guide
 - `DSTACK_DB_MAX_OVERFLOW`{ #DSTACK_DB_MAX_OVERFLOW } - The client DB connections pool allowed overflow. Defaults to `20`.
 - `DSTACK_SERVER_BACKGROUND_PROCESSING_FACTOR`{ #DSTACK_SERVER_BACKGROUND_PROCESSING_FACTOR } - The number of background jobs for processing server resources. Increase if you need to process more resources per server replica quickly. Defaults to `1`.
 - `DSTACK_SERVER_BACKGROUND_PROCESSING_DISABLED`{ #DSTACK_SERVER_BACKGROUND_PROCESSING_DISABLED } - Disables background processing if set to any value. Useful to run only web frontend and API server.
+- `DSTACK_SERVER_MAX_PROBES_PER_JOB`{ #DSTACK_SERVER_MAX_PROBES_PER_JOB } - Maximum number of probes allowed in a run configuration. Validated at apply time.
+- `DSTACK_SERVER_MAX_PROBE_TIMEOUT`{ #DSTACK_SERVER_MAX_PROBE_TIMEOUT } - Maximum allowed timeout for a probe. Validated at apply time.
 
 ??? info "Internal environment variables"
      The following environment variables are intended for development purposes:
 
@@ -12,11 +12,15 @@
     TerminationPolicy,
 )
 from dstack._internal.core.models.runs import (
+    JobStatus,
+    Probe,
+    ProbeSpec,
     RunPlan,
 )
 from dstack._internal.core.services.profiles import get_termination
 from dstack._internal.utils.common import (
     DateFormatter,
+    batched,
     format_duration_multiunit,
     format_pretty_duration,
     pretty_date,
@@ -156,6 +160,12 @@ def get_runs_table(
         table.add_column("INSTANCE TYPE", no_wrap=True, ratio=1)
     table.add_column("PRICE", style="grey58", ratio=1)
     table.add_column("STATUS", no_wrap=True, ratio=1)
+    if verbose or any(
+        run._run.is_deployment_in_progress()
+        and any(job.job_submissions[-1].probes for job in run._run.jobs)
+        for run in runs
+    ):
+        table.add_column("PROBES", ratio=1)
     table.add_column("SUBMITTED", style="grey58", no_wrap=True, ratio=1)
     if verbose:
         table.add_column("ERROR", no_wrap=True, ratio=2)
@@ -198,6 +208,9 @@ def get_runs_table(
                     else ""
                 ),
                 "STATUS": latest_job_submission.status_message,
+                "PROBES": _format_job_probes(
+                    job.job_spec.probes, latest_job_submission.probes, latest_job_submission.status
+                ),
                 "SUBMITTED": format_date(latest_job_submission.submitted_at),
                 "ERROR": latest_job_submission.error,
             }
@@ -226,3 +239,22 @@ def get_runs_table(
             add_row_from_dict(table, job_row, style="secondary" if len(run.jobs) != 1 else None)
 
     return table
+
+
+def _format_job_probes(
+    probe_specs: list[ProbeSpec], probes: list[Probe], job_status: JobStatus
+) -> str:
+    """
+    Build the markup string with one status symbol per probe of a job.
+
+    Args:
+        probe_specs: probe specs, paired positionally with `probes`.
+        probes: probe states whose `success_streak` determines the symbol.
+        job_status: status of the job; symbols are only produced for `RUNNING`.
+
+    Returns:
+        An empty string if `probes` is empty or the job is not `RUNNING`.
+        Otherwise the symbols grouped with `batched(statuses, 5)`, with the groups
+        joined by a single space.
+    """
+    if not probes or job_status != JobStatus.RUNNING:
+        return ""
+    statuses = []
+    # zip() stops at the shorter list, so unmatched specs or probes are silently ignored.
+    for probe_spec, probe in zip(probe_specs, probes):
+        # NOTE: the symbols are documented in concepts/services.md, keep in sync.
+        if probe.success_streak >= probe_spec.ready_after:
+            status = "[code]✓[/]"
+        elif probe.success_streak > 0:
+            status = "[warning]~[/]"
+        else:
+            status = "[error]×[/]"
+        statuses.append(status)
+    # split into whitespace-delimited batches to allow column wrapping
+    return " ".join("".join(batch) for batch in batched(statuses, 5))
@@ -53,6 +53,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
             job_submissions_excludes["exit_status"] = True
         if all(js.deployment_num == 0 for js in job_submissions):
             job_submissions_excludes["deployment_num"] = True
+        if all(not js.probes for js in job_submissions):
+            job_submissions_excludes["probes"] = True
         latest_job_submission = current_resource.latest_job_submission
         if latest_job_submission is not None:
             latest_job_submission_excludes: IncludeExcludeDictType = {}
@@ -69,6 +71,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
                 latest_job_submission_excludes["exit_status"] = True
             if latest_job_submission.deployment_num == 0:
                 latest_job_submission_excludes["deployment_num"] = True
+            if not latest_job_submission.probes:
+                latest_job_submission_excludes["probes"] = True
     return {"plan": apply_plan_excludes}
 
 
@@ -120,6 +124,8 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
         profile_excludes.add("startup_order")
     if configuration.stop_criteria is None:
         configuration_excludes["stop_criteria"] = True
+    if isinstance(configuration, ServiceConfiguration) and not configuration.probes:
+        configuration_excludes["probes"] = True
     if profile is not None and profile.stop_criteria is None:
         profile_excludes.add("stop_criteria")
     if not configuration.files:
@@ -154,6 +160,8 @@ def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType:
         spec_excludes["file_archives"] = True
     if all(s.service_port is None for s in job_specs):
         spec_excludes["service_port"] = True
+    if all(not s.probes for s in job_specs):
+        spec_excludes["probes"] = True
 
     return spec_excludes
 
 
@@ -14,7 +14,7 @@
 from dstack._internal.core.models.files import FilePathMapping
 from dstack._internal.core.models.fleets import FleetConfiguration
 from dstack._internal.core.models.gateways import GatewayConfiguration
-from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
+from dstack._internal.core.models.profiles import ProfileParams, parse_duration, parse_off_duration
 from dstack._internal.core.models.resources import Range, ResourcesSpec
 from dstack._internal.core.models.services import AnyModel, OpenAIChatModel
 from dstack._internal.core.models.unix import UnixUser
@@ -32,6 +32,13 @@
 RUN_PRIOTIRY_MAX = 100
 RUN_PRIORITY_DEFAULT = 0
 DEFAULT_REPO_DIR = "/workflow"
+MIN_PROBE_TIMEOUT = 1
+MIN_PROBE_INTERVAL = 1
+DEFAULT_PROBE_URL = "/"
+DEFAULT_PROBE_TIMEOUT = 10
+DEFAULT_PROBE_INTERVAL = 15
+DEFAULT_PROBE_READY_AFTER = 1
+MAX_PROBE_URL_LEN = 2048
 
 
 class RunConfigurationType(str, Enum):
@@ -162,6 +169,74 @@ class RateLimit(CoreModel):
     ] = 0
 
 
+class ProbeConfig(CoreModel):
+    # Settings of a single probe as written in a service configuration.
+    # Every option except `type` is optional and stays `None` when omitted; the defaults
+    # named in the field descriptions are not applied anywhere in this class.
+    type: Literal["http"]  # expect other probe types in the future, namely `exec`
+    url: Annotated[
+        Optional[str], Field(description=f"The URL to request. Defaults to `{DEFAULT_PROBE_URL}`")
+    ] = None
+    # Accepts an int or a string; the `parse_timeout` validator passes it through `parse_duration`.
+    timeout: Annotated[
+        Optional[Union[int, str]],
+        Field(
+            description=(
+                f"Maximum amount of time the HTTP request is allowed to take. Defaults to `{DEFAULT_PROBE_TIMEOUT}s`"
+            )
+        ),
+    ] = None
+    # Accepts an int or a string; the `parse_interval` validator passes it through `parse_duration`.
+    interval: Annotated[
+        Optional[Union[int, str]],
+        Field(
+            description=(
+                "Minimum amount of time between the end of one probe execution"
+                f" and the start of the next. Defaults to `{DEFAULT_PROBE_INTERVAL}s`"
+            )
+        ),
+    ] = None
+    ready_after: Annotated[
+        Optional[int],
+        Field(
+            ge=1,
+            description=(
+                "The number of consecutive successful probe executions required for the replica"
+                " to be considered ready. Used during rolling deployments."
+                f" Defaults to `{DEFAULT_PROBE_READY_AFTER}`"
+            ),
+        ),
+    ] = None
+
+    class Config:
+        # NOTE(review): presumably frozen so that instances are hashable — the `probes`
+        # validator of the service configuration puts them in a `set`. Confirm.
+        frozen = True
+
+    @validator("timeout")
+    def parse_timeout(cls, v: Optional[Union[int, str]]) -> Optional[int]:
+        """
+        Parse `timeout` with `parse_duration`; `None` is passed through unchanged.
+
+        Raises:
+            ValueError: if the parsed value is below `MIN_PROBE_TIMEOUT`.
+        """
+        if v is None:
+            return v
+        parsed = parse_duration(v)
+        if parsed < MIN_PROBE_TIMEOUT:
+            raise ValueError(f"Probe timeout cannot be shorter than {MIN_PROBE_TIMEOUT}s")
+        return parsed
+
+    @validator("interval")
+    def parse_interval(cls, v: Optional[Union[int, str]]) -> Optional[int]:
+        """
+        Parse `interval` with `parse_duration`; `None` is passed through unchanged.
+
+        Raises:
+            ValueError: if the parsed value is below `MIN_PROBE_INTERVAL`.
+        """
+        if v is None:
+            return v
+        parsed = parse_duration(v)
+        if parsed < MIN_PROBE_INTERVAL:
+            raise ValueError(f"Probe interval cannot be shorter than {MIN_PROBE_INTERVAL}s")
+        return parsed
+
+    @validator("url")
+    def validate_url(cls, v: Optional[str]) -> Optional[str]:
+        """
+        Check the shape of `url`; `None` is passed through unchanged.
+
+        Raises:
+            ValueError: if the value does not start with `/`, is longer than
+                `MAX_PROBE_URL_LEN` characters, or contains non-printable characters.
+        """
+        if v is None:
+            return v
+        # Only values starting with `/` are accepted, so `scheme://host/...` URLs are rejected.
+        if not v.startswith("/"):
+            raise ValueError("Must start with `/`")
+        if len(v) > MAX_PROBE_URL_LEN:
+            raise ValueError(f"Cannot be longer than {MAX_PROBE_URL_LEN} characters")
+        if not v.isprintable():
+            raise ValueError("Cannot contain non-printable characters")
+        return v
+
+
 class BaseRunConfiguration(CoreModel):
     type: Literal["none"]
     name: Annotated[
@@ -448,6 +523,10 @@ class ServiceConfigurationParams(CoreModel):
         Field(description="The auto-scaling rules. Required if `replicas` is set to a range"),
     ] = None
     rate_limits: Annotated[list[RateLimit], Field(description="Rate limiting rules")] = []
+    probes: Annotated[
+        list[ProbeConfig],
+        Field(description="List of probes used to determine job health"),
+    ] = []
 
     @validator("port")
     def convert_port(cls, v) -> PortMapping:
@@ -511,6 +590,16 @@ def validate_rate_limits(cls, v: list[RateLimit]) -> list[RateLimit]:
             )
         return v
 
+    @validator("probes")
+    def validate_probes(cls, v: list[ProbeConfig]) -> list[ProbeConfig]:
+        """
+        Reject a `probes` list that contains the same probe more than once.
+
+        Raises:
+            ValueError: if deduplicating the list with `set()` makes it shorter.
+        """
+        if len(v) != len(set(v)):
+            # Using a custom validator instead of Field(unique_items=True) to avoid Pydantic bug:
+            # https://github.com/pydantic/pydantic/issues/3765
+            # Because of the bug, our gen_schema_reference.py fails to determine the type of
+            # ServiceConfiguration.probes and insert the correct hyperlink.
+            raise ValueError("Probes must be unique")
+        return v
+
 
 class ServiceConfiguration(
     ProfileParams, BaseRunConfigurationWithCommands, ServiceConfigurationParams
 
@@ -1,6 +1,6 @@
 from datetime import datetime, timedelta
 from enum import Enum
-from typing import Any, Dict, List, Optional, Type
+from typing import Any, Dict, List, Literal, Optional, Type
 
 from pydantic import UUID4, Field, root_validator
 from typing_extensions import Annotated
@@ -223,6 +223,14 @@ class JobSSHKey(CoreModel):
     public: str
 
 
+class ProbeSpec(CoreModel):
+    # Probe parameters stored in a job spec (element type of `JobSpec.probes`).
+    # All fields are required here.
+    type: Literal["http"]  # expect other probe types in the future, namely `exec`
+    url: str
+    timeout: int  # presumably seconds — TODO confirm
+    interval: int  # presumably seconds — TODO confirm
+    ready_after: int
+
+
 class JobSpec(CoreModel):
     replica_num: int = 0  # default value for backward compatibility
     job_num: int
@@ -256,6 +264,7 @@ class JobSpec(CoreModel):
     file_archives: list[FileArchiveMapping] = []
     # None for non-services and pre-0.19.19 services. See `get_service_port`
     service_port: Optional[int] = None
+    probes: list[ProbeSpec] = []
 
 
 class JobProvisioningData(CoreModel):
@@ -325,6 +334,10 @@ class ClusterInfo(CoreModel):
     gpus_per_job: int
 
 
+class Probe(CoreModel):
+    # State of one probe of a job submission (element type of `JobSubmission.probes`).
+    # NOTE(review): presumably the number of consecutive successful executions so far,
+    # with 0 meaning the last execution failed — confirm against the server code.
+    success_streak: int
+
+
 class JobSubmission(CoreModel):
     id: UUID4
     submission_num: int
@@ -341,6 +354,7 @@ class JobSubmission(CoreModel):
     job_provisioning_data: Optional[JobProvisioningData]
     job_runtime_data: Optional[JobRuntimeData]
     error: Optional[str] = None
+    probes: list[Probe] = []
 
     @property
     def age(self) -> timedelta:
 
@@ -236,6 +236,13 @@ def __enter__(self):
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
 
+    async def __aenter__(self):
+        """Enter the `async with` block: await `self.aopen()` and return `self`."""
+        await self.aopen()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Leave the `async with` block: await `self.aclose()`; the exception arguments are not used."""
+        await self.aclose()
+
     def _get_proxy_command(self) -> Optional[str]:
         proxy_command: Optional[str] = None
         for params, identity_path in self.ssh_proxies: