Skip to content

Commit 9d4b2ef

Browse files
[Bug]: nebius.aio.service_error.RequestError: Request error DEADLINE_EXCEEDED: Deadline Exceeded #2962 (#3028)
1 parent 7058a36 commit 9d4b2ef

File tree

2 files changed: +22 additions, -12 deletions

src/dstack/_internal/core/backends/nebius/compute.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
364364
)
365365
time.sleep(WAIT_FOR_INSTANCE_UPDATE_INTERVAL)
366366
resources.LOOP.await_(
367-
op.update(timeout=resources.REQUEST_TIMEOUT, metadata=resources.REQUEST_MD)
367+
op.update(per_retry_timeout=resources.REQUEST_TIMEOUT, metadata=resources.REQUEST_MD)
368368
)
369369

370370

src/dstack/_internal/core/backends/nebius/resources.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def wait_for_operation(
119119
if time.monotonic() + interval > deadline:
120120
raise TimeoutError(f"Operation {op.id} wait timeout")
121121
time.sleep(interval)
122-
LOOP.await_(op.update(timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
122+
LOOP.await_(op.update(per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
123123

124124

125125
def get_region_to_project_id_map(
@@ -155,7 +155,7 @@ def validate_regions(configured: set[str], available: set[str]) -> None:
155155
def list_tenant_projects(sdk: SDK) -> Sequence[Container]:
156156
tenants = LOOP.await_(
157157
TenantServiceClient(sdk).list(
158-
ListTenantsRequest(), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
158+
ListTenantsRequest(), per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
159159
)
160160
)
161161
if len(tenants.items) != 1:
@@ -164,7 +164,7 @@ def list_tenant_projects(sdk: SDK) -> Sequence[Container]:
164164
projects = LOOP.await_(
165165
ProjectServiceClient(sdk).list(
166166
ListProjectsRequest(parent_id=tenant_id, page_size=999),
167-
timeout=REQUEST_TIMEOUT,
167+
per_retry_timeout=REQUEST_TIMEOUT,
168168
metadata=REQUEST_MD,
169169
)
170170
)
@@ -238,7 +238,7 @@ def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:
238238
subnets = LOOP.await_(
239239
SubnetServiceClient(sdk).list(
240240
ListSubnetsRequest(parent_id=project_id, page_size=999),
241-
timeout=REQUEST_TIMEOUT,
241+
per_retry_timeout=REQUEST_TIMEOUT,
242242
metadata=REQUEST_MD,
243243
)
244244
)
@@ -264,13 +264,15 @@ def create_disk(
264264
),
265265
)
266266
with wrap_capacity_errors():
267-
return LOOP.await_(client.create(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
267+
return LOOP.await_(
268+
client.create(request, per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD)
269+
)
268270

269271

270272
def delete_disk(sdk: SDK, disk_id: str) -> None:
271273
LOOP.await_(
272274
DiskServiceClient(sdk).delete(
273-
DeleteDiskRequest(id=disk_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
275+
DeleteDiskRequest(id=disk_id), per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
274276
)
275277
)
276278

@@ -318,21 +320,27 @@ def create_instance(
318320
),
319321
)
320322
with wrap_capacity_errors():
321-
return LOOP.await_(client.create(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
323+
return LOOP.await_(
324+
client.create(request, per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD)
325+
)
322326

323327

324328
def get_instance(sdk: SDK, instance_id: str) -> Instance:
325329
return LOOP.await_(
326330
InstanceServiceClient(sdk).get(
327-
GetInstanceRequest(id=instance_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
331+
GetInstanceRequest(id=instance_id),
332+
per_retry_timeout=REQUEST_TIMEOUT,
333+
metadata=REQUEST_MD,
328334
)
329335
)
330336

331337

332338
def delete_instance(sdk: SDK, instance_id: str) -> SDKOperation[Operation]:
333339
return LOOP.await_(
334340
InstanceServiceClient(sdk).delete(
335-
DeleteInstanceRequest(id=instance_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
341+
DeleteInstanceRequest(id=instance_id),
342+
per_retry_timeout=REQUEST_TIMEOUT,
343+
metadata=REQUEST_MD,
336344
)
337345
)
338346

@@ -345,7 +353,7 @@ def create_cluster(sdk: SDK, name: str, project_id: str, fabric: str) -> SDKOper
345353
metadata=ResourceMetadata(name=name, parent_id=project_id),
346354
spec=GpuClusterSpec(infiniband_fabric=fabric),
347355
),
348-
timeout=REQUEST_TIMEOUT,
356+
per_retry_timeout=REQUEST_TIMEOUT,
349357
metadata=REQUEST_MD,
350358
)
351359
)
@@ -354,6 +362,8 @@ def create_cluster(sdk: SDK, name: str, project_id: str, fabric: str) -> SDKOper
354362
def delete_cluster(sdk: SDK, cluster_id: str) -> None:
355363
return LOOP.await_(
356364
GpuClusterServiceClient(sdk).delete(
357-
DeleteGpuClusterRequest(id=cluster_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
365+
DeleteGpuClusterRequest(id=cluster_id),
366+
per_retry_timeout=REQUEST_TIMEOUT,
367+
metadata=REQUEST_MD,
358368
)
359369
)

Comments (0)