Skip to content

Commit acd006d

Browse files
committed
Service probes
Allow configuring HTTP probes for services and use probe results to avoid downtime during deployments.
1 parent b58ad40 commit acd006d

File tree

24 files changed

+682
-22
lines changed

24 files changed

+682
-22
lines changed

docs/docs/concepts/services.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -783,7 +783,7 @@ The rolling deployment stops when all replicas are updated or when a new deploym
783783
??? info "Supported properties"
784784
<!-- NOTE: should be in sync with constants in server/services/runs.py -->
785785

786-
Rolling deployment supports changes to the following properties: `port`, `resources`, `volumes`, `docker`, `files`, `image`, `user`, `privileged`, `entrypoint`, `working_dir`, `python`, `nvcc`, `single_branch`, `env`, `shell`, `commands`, as well as changes to [repo](repos.md) or [file](#files) contents.
786+
Rolling deployment supports changes to the following properties: `port`, `probes`, `resources`, `volumes`, `docker`, `files`, `image`, `user`, `privileged`, `entrypoint`, `working_dir`, `python`, `nvcc`, `single_branch`, `env`, `shell`, `commands`, as well as changes to [repo](repos.md) or [file](#files) contents.
787787

788788
Changes to `replicas` and `scaling` can be applied without redeploying replicas.
789789

@@ -792,6 +792,8 @@ The rolling deployment stops when all replicas are updated or when a new deploym
792792
To trigger a rolling deployment when no properties have changed (e.g., after updating [secrets](secrets.md) or to restart all replicas),
793793
make a minor config change, such as adding a dummy [environment variable](#environment-variables).
794794

795+
<!-- TODO: probes -->
796+
795797
--8<-- "docs/concepts/snippets/manage-runs.ext"
796798

797799
!!! info "What's next?"

src/dstack/_internal/cli/utils/run.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,21 @@
55
from rich.table import Table
66

77
from dstack._internal.cli.utils.common import NO_OFFERS_WARNING, add_row_from_dict, console
8-
from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
8+
from dstack._internal.core.models.configurations import DevEnvironmentConfiguration, ProbeConfig
99
from dstack._internal.core.models.instances import InstanceAvailability
1010
from dstack._internal.core.models.profiles import (
1111
DEFAULT_RUN_TERMINATION_IDLE_TIME,
1212
TerminationPolicy,
1313
)
1414
from dstack._internal.core.models.runs import (
15+
JobStatus,
16+
Probe,
1517
RunPlan,
1618
)
1719
from dstack._internal.core.services.profiles import get_termination
1820
from dstack._internal.utils.common import (
1921
DateFormatter,
22+
batched,
2023
format_duration_multiunit,
2124
format_pretty_duration,
2225
pretty_date,
@@ -156,6 +159,12 @@ def get_runs_table(
156159
table.add_column("INSTANCE TYPE", no_wrap=True, ratio=1)
157160
table.add_column("PRICE", style="grey58", ratio=1)
158161
table.add_column("STATUS", no_wrap=True, ratio=1)
162+
if verbose or any(
163+
run._run.is_deployment_in_progress()
164+
and any(job.job_submissions[-1].probes for job in run._run.jobs)
165+
for run in runs
166+
):
167+
table.add_column("PROBES", ratio=1)
159168
table.add_column("SUBMITTED", style="grey58", no_wrap=True, ratio=1)
160169
if verbose:
161170
table.add_column("ERROR", no_wrap=True, ratio=2)
@@ -198,6 +207,9 @@ def get_runs_table(
198207
else ""
199208
),
200209
"STATUS": latest_job_submission.status_message,
210+
"PROBES": _format_job_probes(
211+
job.job_spec.probes, latest_job_submission.probes, latest_job_submission.status
212+
),
201213
"SUBMITTED": format_date(latest_job_submission.submitted_at),
202214
"ERROR": latest_job_submission.error,
203215
}
@@ -226,3 +238,21 @@ def get_runs_table(
226238
add_row_from_dict(table, job_row, style="secondary" if len(run.jobs) != 1 else None)
227239

228240
return table
241+
242+
243+
def _format_job_probes(
    probe_configs: list[ProbeConfig], probes: list[Probe], job_status: JobStatus
) -> str:
    """Render a compact per-probe status string for a running job.

    Each probe is shown as one mark: `✓` when its success streak has reached
    the configured `ready_after` threshold, `~` when it has some successes but
    is not yet ready, and `×` when it has no current success streak. Returns an
    empty string when there are no probe results or the job is not running.
    """
    if job_status != JobStatus.RUNNING or not probes:
        return ""
    marks = []
    # probe_configs and probes are expected to be parallel lists (one result per config).
    for config, probe in zip(probe_configs, probes):
        if probe.success_streak >= config.ready_after:
            marks.append("[code]✓[/]")
        elif probe.success_streak > 0:
            marks.append("[warning]~[/]")
        else:
            marks.append("[error]×[/]")
    # Emit marks in space-separated groups of 5 so Rich can wrap the column.
    groups = ("".join(group) for group in batched(marks, 5))
    return " ".join(groups)

src/dstack/_internal/core/compatibility/runs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
120120
profile_excludes.add("startup_order")
121121
if configuration.stop_criteria is None:
122122
configuration_excludes["stop_criteria"] = True
123+
# TODO: probes
123124
if profile is not None and profile.stop_criteria is None:
124125
profile_excludes.add("stop_criteria")
125126
if not configuration.files:
@@ -154,6 +155,7 @@ def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType:
154155
spec_excludes["file_archives"] = True
155156
if all(s.service_port is None for s in job_specs):
156157
spec_excludes["service_port"] = True
158+
# TODO: probes
157159

158160
return spec_excludes
159161

src/dstack/_internal/core/models/configurations.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from dstack._internal.core.models.files import FilePathMapping
1515
from dstack._internal.core.models.fleets import FleetConfiguration
1616
from dstack._internal.core.models.gateways import GatewayConfiguration
17-
from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
17+
from dstack._internal.core.models.profiles import ProfileParams, parse_duration, parse_off_duration
1818
from dstack._internal.core.models.resources import Range, ResourcesSpec
1919
from dstack._internal.core.models.services import AnyModel, OpenAIChatModel
2020
from dstack._internal.core.models.unix import UnixUser
@@ -32,6 +32,8 @@
3232
RUN_PRIOTIRY_MAX = 100
3333
RUN_PRIORITY_DEFAULT = 0
3434
DEFAULT_REPO_DIR = "/workflow"
35+
MIN_PROBE_TIMEOUT = 1
36+
MIN_PROBE_INTERVAL = 1
3537

3638

3739
class RunConfigurationType(str, Enum):
@@ -162,6 +164,58 @@ class RateLimit(CoreModel):
162164
] = 0
163165

164166

167+
class ProbeConfig(CoreModel):
    """Configuration of a single HTTP probe used to check service job health.

    `timeout` and `interval` accept either an integer number of seconds or a
    duration string (e.g. `"10s"`); validators normalize both to seconds and
    enforce the module-level minimums.
    """

    type: Literal["http"]  # expect other probe types in the future, namely `exec`
    # Request path; must be absolute (start with `/`), enforced by `validate_url`.
    url: Annotated[str, Field(description="The URL to request")] = "/"
    timeout: Annotated[
        Union[int, str],
        Field(description=("Maximum amount of time the HTTP request is allowed to take")),
    ] = "10s"
    interval: Annotated[
        Union[int, str],
        Field(
            description=(
                "Minimum amount of time between the end of one probe execution"
                " and the start of the next"
            )
        ),
    ] = "15s"
    ready_after: Annotated[
        int,
        Field(
            ge=1,
            description=(
                "The number of consecutive successful probe executions required for the job"
                " to be considered ready. Used during rolling deployments"
            ),
        ),
    ] = 1

    class Config:
        # Immutable and hashable, which also enables `unique_items=True`
        # on list fields holding ProbeConfig.
        frozen = True

    @validator("timeout")
    def parse_timeout(cls, v: Union[int, str]) -> int:
        """Normalize `timeout` to whole seconds; reject values below MIN_PROBE_TIMEOUT."""
        parsed = parse_duration(v)
        if parsed < MIN_PROBE_TIMEOUT:
            raise ValueError(f"Probe timeout cannot be shorter than {MIN_PROBE_TIMEOUT}s")
        return parsed

    @validator("interval")
    def parse_interval(cls, v: Union[int, str]) -> int:
        """Normalize `interval` to whole seconds; reject values below MIN_PROBE_INTERVAL."""
        parsed = parse_duration(v)
        if parsed < MIN_PROBE_INTERVAL:
            raise ValueError(f"Probe interval cannot be shorter than {MIN_PROBE_INTERVAL}s")
        return parsed

    @validator("url")
    def validate_url(cls, v: str) -> str:
        """Require an absolute path so it can be joined with the job's base URL."""
        # TODO: stricter constraints to avoid HTTPX URL parsing errors
        if not v.startswith("/"):
            raise ValueError("Must start with `/`")
        return v
217+
218+
165219
class BaseRunConfiguration(CoreModel):
166220
type: Literal["none"]
167221
name: Annotated[
@@ -448,6 +502,10 @@ class ServiceConfigurationParams(CoreModel):
448502
Field(description="The auto-scaling rules. Required if `replicas` is set to a range"),
449503
] = None
450504
rate_limits: Annotated[list[RateLimit], Field(description="Rate limiting rules")] = []
505+
probes: Annotated[
506+
list[ProbeConfig],
507+
Field(unique_items=True, description="List of probes used to determine job health"),
508+
] = []
451509

452510
@validator("port")
453511
def convert_port(cls, v) -> PortMapping:

src/dstack/_internal/core/models/runs.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from dstack._internal.core.models.configurations import (
1111
DEFAULT_REPO_DIR,
1212
AnyRunConfiguration,
13+
ProbeConfig,
1314
RunConfiguration,
1415
ServiceConfiguration,
1516
)
@@ -256,6 +257,7 @@ class JobSpec(CoreModel):
256257
file_archives: list[FileArchiveMapping] = []
257258
# None for non-services and pre-0.19.19 services. See `get_service_port`
258259
service_port: Optional[int] = None
260+
probes: list[ProbeConfig] = []
259261

260262

261263
class JobProvisioningData(CoreModel):
@@ -325,6 +327,10 @@ class ClusterInfo(CoreModel):
325327
gpus_per_job: int
326328

327329

330+
class Probe(CoreModel):
    """Latest execution state of a single probe for a job submission."""

    # Number of consecutive successful probe executions observed so far;
    # compared against ProbeConfig.ready_after to decide readiness.
    success_streak: int
332+
333+
328334
class JobSubmission(CoreModel):
329335
id: UUID4
330336
submission_num: int
@@ -341,6 +347,7 @@ class JobSubmission(CoreModel):
341347
job_provisioning_data: Optional[JobProvisioningData]
342348
job_runtime_data: Optional[JobRuntimeData]
343349
error: Optional[str] = None
350+
probes: list[Probe] = []
344351

345352
@property
346353
def age(self) -> timedelta:

src/dstack/_internal/core/services/ssh/tunnel.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,13 @@ def __enter__(self):
236236
def __exit__(self, exc_type, exc_val, exc_tb):
237237
self.close()
238238

239+
    async def __aenter__(self):
        """Async context-manager entry: open the tunnel and return self."""
        await self.aopen()
        return self
242+
243+
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context-manager exit: close the tunnel; exceptions are not suppressed."""
        await self.aclose()
245+
239246
def _get_proxy_command(self) -> Optional[str]:
240247
proxy_command: Optional[str] = None
241248
for params, identity_path in self.ssh_proxies:

src/dstack/_internal/server/app.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from dstack._internal.proxy.lib.routers import model_proxy
2222
from dstack._internal.server import settings
2323
from dstack._internal.server.background import start_background_tasks
24+
from dstack._internal.server.background.tasks.process_probes import PROBES_SCHEDULER
2425
from dstack._internal.server.db import get_db, get_session_ctx, migrate
2526
from dstack._internal.server.routers import (
2627
backends,
@@ -155,6 +156,7 @@ async def lifespan(app: FastAPI):
155156
scheduler = start_background_tasks()
156157
else:
157158
logger.info("Background processing is disabled")
159+
PROBES_SCHEDULER.start()
158160
dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
159161
logger.info(f"The admin token is {admin.token.get_plaintext_or_error()}", {"show_path": False})
160162
logger.info(
@@ -166,6 +168,7 @@ async def lifespan(app: FastAPI):
166168
yield
167169
if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
168170
scheduler.shutdown()
171+
PROBES_SCHEDULER.shutdown(wait=False)
169172
await gateway_connections_pool.remove_all()
170173
service_conn_pool = await get_injector_from_app(app).get_service_connection_pool()
171174
await service_conn_pool.remove_all()

src/dstack/_internal/server/background/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from dstack._internal.server.background.tasks.process_placement_groups import (
1919
process_placement_groups,
2020
)
21+
from dstack._internal.server.background.tasks.process_probes import process_probes
2122
from dstack._internal.server.background.tasks.process_prometheus_metrics import (
2223
collect_prometheus_metrics,
2324
delete_prometheus_metrics,
@@ -63,6 +64,7 @@ def start_background_tasks() -> AsyncIOScheduler:
6364
# that the first waiting for the lock will acquire it.
6465
# The jitter is needed to give all tasks a chance to acquire locks.
6566

67+
_scheduler.add_job(process_probes, IntervalTrigger(seconds=3, jitter=1))
6668
_scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
6769
_scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
6870
if settings.ENABLE_PROMETHEUS_METRICS:

0 commit comments

Comments
 (0)