Skip to content

Commit 8951afb

Browse files
authored
Switch to nebius sdk 0.3 (#3222)
* Switch to nebius sdk 0.3
* Pass auth_options to op.update
1 parent 1c56561 commit 8951afb

File tree

3 files changed

+32
-23
lines changed

3 files changed

+32
-23
lines changed

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -190,8 +190,7 @@ oci = [
190190
"dstack[server]",
191191
]
192192
nebius = [
193-
# 0.2.73 breaks sdk backward compatibility: https://github.com/dstackai/dstack/issues/3171
194-
"nebius>=0.2.40,<=0.2.72; python_version >= '3.10'",
193+
"nebius>=0.3.4,<0.4; python_version >= '3.10'",
195194
"dstack[server]",
196195
]
197196
all = [

src/dstack/_internal/core/backends/nebius/compute.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -380,7 +380,10 @@ def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
380380
)
381381
time.sleep(WAIT_FOR_INSTANCE_UPDATE_INTERVAL)
382382
resources.LOOP.await_(
383-
op.update(per_retry_timeout=resources.REQUEST_TIMEOUT, metadata=resources.REQUEST_MD)
383+
op.update(
384+
per_retry_timeout=resources.REQUEST_TIMEOUT,
385+
auth_options=resources.REQUEST_AUTH_OPTIONS,
386+
)
384387
)
385388

386389

src/dstack/_internal/core/backends/nebius/resources.py

Lines changed: 27 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,6 @@
88
from tempfile import NamedTemporaryFile
99
from typing import Dict, Optional
1010

11-
from nebius.aio.authorization.options import options_to_metadata
1211
from nebius.aio.operation import Operation as SDKOperation
1312
from nebius.aio.service_error import RequestError, StatusCode
1413
from nebius.aio.token.renewable import OPTION_RENEW_REQUEST_TIMEOUT, OPTION_RENEW_SYNCHRONOUS
@@ -66,13 +65,11 @@
6665
LOOP = DaemonEventLoop()
6766
# Pass a timeout to all methods to avoid infinite waiting
6867
REQUEST_TIMEOUT = 10
69-
# Pass REQUEST_MD to all methods to avoid infinite retries in case of invalid credentials
70-
REQUEST_MD = options_to_metadata(
71-
{
72-
OPTION_RENEW_SYNCHRONOUS: "true",
73-
OPTION_RENEW_REQUEST_TIMEOUT: "5",
74-
}
75-
)
68+
# Pass REQUEST_AUTH_OPTIONS to all methods to avoid infinite retries in case of invalid credentials
69+
REQUEST_AUTH_OPTIONS = {
70+
OPTION_RENEW_SYNCHRONOUS: "true",
71+
OPTION_RENEW_REQUEST_TIMEOUT: "5",
72+
}
7673

7774
# disables log messages about errors such as invalid creds or expired timeouts
7875
logging.getLogger("nebius").setLevel(logging.CRITICAL)
@@ -120,7 +117,9 @@ def wait_for_operation(
120117
if time.monotonic() + interval > deadline:
121118
raise TimeoutError(f"Operation {op.id} wait timeout")
122119
time.sleep(interval)
123-
LOOP.await_(op.update(per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
120+
LOOP.await_(
121+
op.update(per_retry_timeout=REQUEST_TIMEOUT, auth_options=REQUEST_AUTH_OPTIONS)
122+
)
124123

125124

126125
def get_region_to_project_id_map(
@@ -156,7 +155,9 @@ def validate_regions(configured: set[str], available: set[str]) -> None:
156155
def list_tenant_projects(sdk: SDK) -> Sequence[Container]:
157156
tenants = LOOP.await_(
158157
TenantServiceClient(sdk).list(
159-
ListTenantsRequest(), per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
158+
ListTenantsRequest(),
159+
per_retry_timeout=REQUEST_TIMEOUT,
160+
auth_options=REQUEST_AUTH_OPTIONS,
160161
)
161162
)
162163
if len(tenants.items) != 1:
@@ -166,7 +167,7 @@ def list_tenant_projects(sdk: SDK) -> Sequence[Container]:
166167
ProjectServiceClient(sdk).list(
167168
ListProjectsRequest(parent_id=tenant_id, page_size=999),
168169
per_retry_timeout=REQUEST_TIMEOUT,
169-
metadata=REQUEST_MD,
170+
auth_options=REQUEST_AUTH_OPTIONS,
170171
)
171172
)
172173
return projects.items
@@ -240,7 +241,7 @@ def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:
240241
SubnetServiceClient(sdk).list(
241242
ListSubnetsRequest(parent_id=project_id, page_size=999),
242243
per_retry_timeout=REQUEST_TIMEOUT,
243-
metadata=REQUEST_MD,
244+
auth_options=REQUEST_AUTH_OPTIONS,
244245
)
245246
)
246247
for subnet in subnets.items:
@@ -267,14 +268,18 @@ def create_disk(
267268
)
268269
with wrap_capacity_errors():
269270
return LOOP.await_(
270-
client.create(request, per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD)
271+
client.create(
272+
request, per_retry_timeout=REQUEST_TIMEOUT, auth_options=REQUEST_AUTH_OPTIONS
273+
)
271274
)
272275

273276

274277
def delete_disk(sdk: SDK, disk_id: str) -> None:
275278
LOOP.await_(
276279
DiskServiceClient(sdk).delete(
277-
DeleteDiskRequest(id=disk_id), per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
280+
DeleteDiskRequest(id=disk_id),
281+
per_retry_timeout=REQUEST_TIMEOUT,
282+
auth_options=REQUEST_AUTH_OPTIONS,
278283
)
279284
)
280285

@@ -325,7 +330,9 @@ def create_instance(
325330
)
326331
with wrap_capacity_errors():
327332
return LOOP.await_(
328-
client.create(request, per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD)
333+
client.create(
334+
request, per_retry_timeout=REQUEST_TIMEOUT, auth_options=REQUEST_AUTH_OPTIONS
335+
)
329336
)
330337

331338

@@ -334,7 +341,7 @@ def get_instance(sdk: SDK, instance_id: str) -> Instance:
334341
InstanceServiceClient(sdk).get(
335342
GetInstanceRequest(id=instance_id),
336343
per_retry_timeout=REQUEST_TIMEOUT,
337-
metadata=REQUEST_MD,
344+
auth_options=REQUEST_AUTH_OPTIONS,
338345
)
339346
)
340347

@@ -344,7 +351,7 @@ def delete_instance(sdk: SDK, instance_id: str) -> SDKOperation[Operation]:
344351
InstanceServiceClient(sdk).delete(
345352
DeleteInstanceRequest(id=instance_id),
346353
per_retry_timeout=REQUEST_TIMEOUT,
347-
metadata=REQUEST_MD,
354+
auth_options=REQUEST_AUTH_OPTIONS,
348355
)
349356
)
350357

@@ -358,17 +365,17 @@ def create_cluster(sdk: SDK, name: str, project_id: str, fabric: str) -> SDKOper
358365
spec=GpuClusterSpec(infiniband_fabric=fabric),
359366
),
360367
per_retry_timeout=REQUEST_TIMEOUT,
361-
metadata=REQUEST_MD,
368+
auth_options=REQUEST_AUTH_OPTIONS,
362369
)
363370
)
364371

365372

366373
def delete_cluster(sdk: SDK, cluster_id: str) -> None:
367-
return LOOP.await_(
374+
LOOP.await_(
368375
GpuClusterServiceClient(sdk).delete(
369376
DeleteGpuClusterRequest(id=cluster_id),
370377
per_retry_timeout=REQUEST_TIMEOUT,
371-
metadata=REQUEST_MD,
378+
auth_options=REQUEST_AUTH_OPTIONS,
372379
)
373380
)
374381

0 commit comments

Comments
 (0)