Skip to content

Commit ca0319a

Browse files
authored
Fix terminating TensorDock instances (#2480)
- Do not handle exceptions in `terminate_instance`, so that they can be detected and termination can be retried.
- Ignore termination errors if the instance is already terminated.
- Increase the request timeout, since termination and other requests often take more than 12 seconds.
1 parent 4646bfe commit ca0319a

2 files changed

Lines changed: 7 additions & 19 deletions

File tree

src/dstack/_internal/core/backends/tensordock/api_client.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
from dstack._internal.utils.logging import get_logger
99

1010
logger = get_logger(__name__)
11-
REQUEST_TIMEOUT = 12
11+
REQUEST_TIMEOUT = 20
1212

1313

1414
class TensorDockAPIClient:
@@ -80,7 +80,7 @@ def deploy_single(self, instance_name: str, instance: InstanceType, cloudinit: d
8080
data["password"] = form["password"]
8181
return data
8282

83-
def delete_single(self, instance_id: str):
83+
def delete_single_if_exists(self, instance_id: str):
8484
logger.debug("Deleting instance %s", instance_id)
8585
resp = self.s.post(
8686
self._url("/client/delete/single"),
@@ -91,10 +91,11 @@ def delete_single(self, instance_id: str):
9191
},
9292
timeout=REQUEST_TIMEOUT,
9393
)
94-
resp.raise_for_status()
9594
try:
9695
data = resp.json()
97-
if not data["success"]:
96+
if "already terminated" in data.get("error", ""):
97+
return
98+
if not data.get("success"):
9899
raise BackendError(data)
99100
except ValueError: # json parsing error
100101
raise BackendError(resp.text)

src/dstack/_internal/core/backends/tensordock/compute.py

Lines changed: 2 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
from dstack._internal.core.backends.base.offers import get_catalog_offers
1313
from dstack._internal.core.backends.tensordock.api_client import TensorDockAPIClient
1414
from dstack._internal.core.backends.tensordock.models import TensorDockConfig
15-
from dstack._internal.core.errors import BackendError, NoCapacityError
15+
from dstack._internal.core.errors import NoCapacityError
1616
from dstack._internal.core.models.backends.base import BackendType
1717
from dstack._internal.core.models.instances import (
1818
InstanceAvailability,
@@ -117,17 +117,4 @@ def create_instance(
117117
def terminate_instance(
118118
self, instance_id: str, region: str, backend_data: Optional[str] = None
119119
):
120-
try:
121-
self.api_client.delete_single(instance_id)
122-
except requests.HTTPError as e:
123-
logger.error(
124-
"An HTTP error occurred when trying to terminate TensorDock instance %s: %s",
125-
instance_id,
126-
e,
127-
)
128-
except BackendError as e:
129-
logger.error(
130-
"TensorDock returned an error when trying to terminate instance %s: %s",
131-
instance_id,
132-
e,
133-
)
120+
self.api_client.delete_single_if_exists(instance_id)

0 commit comments

Comments
 (0)