Skip to content

Commit ebd78e1

Browse files
peterschmidt85 and Andrey Cheptsov authored
RunPod: add CPU offer collection in gpuhunt (#225)
* runpod: add CPU offers to gpuhunt catalog
* runpod: gate CPU offers with runpod-cpu flag
* runpod: require cpu disk limit and strengthen cpu checks
  - require diskLimitPerVcpu for CPU offers and set disk_size unconditionally
  - add CPU integrity assertions for runpod catalog rows
  - extend provider tests for CPU flag/spot/location invariants

---------

Co-authored-by: Andrey Cheptsov <andrey.cheptsov@github.com>
1 parent 1c14ee5 commit ebd78e1

3 files changed

Lines changed: 341 additions & 0 deletions

File tree

src/gpuhunt/providers/runpod.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ def _fetch_offers(self) -> list[RawCatalogItem]:
6868

6969
cluster_catalog_items = self._fetch_cluster_offers()
7070
catalog_items.extend(cluster_catalog_items)
71+
cpu_catalog_items = self._fetch_cpu_offers()
72+
catalog_items.extend(cpu_catalog_items)
7173
return catalog_items
7274

7375
def _build_query_variables(self) -> list[dict]:
@@ -221,6 +223,89 @@ def _fetch_cluster_offers(self) -> list[RawCatalogItem]:
221223
cluster_catalog_items.append(catalog_item)
222224
return cluster_catalog_items
223225

226+
def _fetch_cpu_offers(self) -> list[RawCatalogItem]:
    """Collect CPU-only catalog items for every listed data center.

    Queries the data center list, keeps the entries whose ``listed`` flag is
    truthy, fetches the CPU flavors of each one concurrently and expands
    them into catalog items. A data center whose flavor request raises
    ``RequestException`` is logged and left out of the result.

    Returns:
        Catalog items in the order the data centers were returned; an empty
        list when no data center is listed.
    """
    payload = {"query": cpu_data_centers_query, "variables": {}}
    listed_ids = [
        entry["id"]
        for entry in _make_request(payload)["data"]["dataCenters"]
        if entry["listed"]
    ]
    if not listed_ids:
        return []

    flavors_by_dc: dict[str, list[dict]] = {}
    with ThreadPoolExecutor(max_workers=10) as pool:
        # Submit everything first so the requests overlap, then collect the
        # results in submission order.
        pending = [(pool.submit(self._get_cpu_flavors, dc), dc) for dc in listed_ids]
        for task, dc in pending:
            try:
                flavors = task.result()
            except RequestException as e:
                logger.exception("Failed to get cpuFlavors data for %s: %s", dc, e)
            else:
                flavors_by_dc[dc] = flavors

    offers = []
    for dc in listed_ids:
        flavors = flavors_by_dc.get(dc)
        # Missing entry (failed request) or a null flavor list: nothing to add.
        if flavors is not None:
            offers.extend(self._make_cpu_catalog_items(dc, flavors))
    return offers
250+
251+
def _get_cpu_flavors(self, data_center_id: str) -> list[dict]:
    """Request the CPU flavors for one data center.

    Args:
        data_center_id: Value sent as the ``dataCenterId`` query variable.

    Returns:
        The ``cpuFlavors`` entry of the response's ``data`` object.
    """
    payload = {
        "query": query_cpu_flavors,
        "variables": {"dataCenterId": data_center_id},
    }
    return _make_request(payload)["data"]["cpuFlavors"]
256+
257+
def _make_cpu_catalog_items(
    self, data_center_id: str, cpu_flavors: list[dict]
) -> list[RawCatalogItem]:
    """Expand CPU flavors into one catalog item per advertised vCPU size.

    A flavor is skipped when its ``specifics`` lack ``stockStatus`` or
    ``securePrice``, when any of ``minVcpu``, ``maxVcpu``, ``ramMultiplier``
    or ``diskLimitPerVcpu`` is missing, when the vCPU range is non-positive
    or inverted, or when the per-vCPU disk limit is not positive.

    Args:
        data_center_id: Used as the ``location`` of every produced item.
        cpu_flavors: Flavor dicts as returned by the ``cpuFlavors`` query.

    Returns:
        Non-spot, GPU-less items flagged ``runpod-cpu``. The price is
        ``securePrice`` scaled by ``vcpu / minVcpu``.
    """
    offers: list[RawCatalogItem] = []
    for flavor in cpu_flavors:
        specifics = flavor.get("specifics") or {}
        hourly_base = specifics.get("securePrice")
        if specifics.get("stockStatus") is None or hourly_base is None:
            continue

        vcpu_floor = flavor.get("minVcpu")
        vcpu_ceiling = flavor.get("maxVcpu")
        ram_per_vcpu = flavor.get("ramMultiplier")
        disk_per_vcpu = flavor.get("diskLimitPerVcpu")
        required = (vcpu_floor, vcpu_ceiling, ram_per_vcpu, disk_per_vcpu)
        if any(value is None for value in required):
            continue

        invalid_range = vcpu_floor <= 0 or vcpu_ceiling <= 0 or vcpu_floor > vcpu_ceiling
        if invalid_range:
            continue

        disk_gb_per_vcpu = int(disk_per_vcpu)
        if disk_gb_per_vcpu <= 0:
            continue

        for vcpu in _cpu_size_ladder(int(vcpu_floor), int(vcpu_ceiling)):
            # `ramMultiplier` maps vCPU to RAM in GB.
            # NOTE(review): the multiplier is truncated to an int and is not
            # checked for positivity - confirm the API never returns
            # fractional or zero values.
            memory = int(vcpu * int(ram_per_vcpu))
            disk_size = float(vcpu * disk_gb_per_vcpu)
            price = hourly_base * (vcpu / vcpu_floor)
            offer = RawCatalogItem(
                instance_name=f"{flavor['id']}-{vcpu}-{memory}",
                location=data_center_id,
                price=price,
                cpu=vcpu,
                memory=memory,
                gpu_count=0,
                gpu_name=None,
                gpu_memory=None,
                spot=False,
                disk_size=disk_size,
                flags=["runpod-cpu"],
                provider_data={},
            )
            offers.append(offer)
    return offers
308+
224309
def _get_gpu_vendor_and_name(
225310
self,
226311
gpu_id: str,
@@ -286,6 +371,45 @@ def _get_amd_gpu_name(name: str) -> Optional[str]:
286371
return None
287372

288373

374+
def _cpu_size_ladder(min_vcpu: int, max_vcpu: int) -> list[int]:
    """Return the vCPU sizes to offer between ``min_vcpu`` and ``max_vcpu``.

    The ladder starts at ``min_vcpu`` and doubles while the value does not
    exceed ``max_vcpu``. If doubling does not land exactly on ``max_vcpu``,
    ``max_vcpu`` is appended as the final step.

    Args:
        min_vcpu: Smallest size; must be positive to produce any sizes.
        max_vcpu: Largest size, always the last element of a non-empty result.

    Returns:
        Strictly increasing sizes, or an empty list when ``min_vcpu`` is not
        positive or is greater than ``max_vcpu``.
    """
    if min_vcpu <= 0:
        # Doubling a non-positive value never grows past max_vcpu, so the
        # loop below would not terminate.
        return []

    sizes = []
    current = min_vcpu
    while current <= max_vcpu:
        sizes.append(current)
        current *= 2
    # Sizes are already unique and ascending; max_vcpu, when appended, is
    # strictly greater than the last doubled value.
    if sizes and sizes[-1] != max_vcpu:
        sizes.append(max_vcpu)
    return sizes
383+
384+
385+
cpu_data_centers_query = """
386+
query CpuDataCenters {
387+
dataCenters {
388+
id
389+
listed
390+
}
391+
}
392+
"""
393+
394+
query_cpu_flavors = """
395+
query CpuFlavors($dataCenterId: String!) {
396+
cpuFlavors {
397+
id
398+
displayName
399+
groupId
400+
minVcpu
401+
maxVcpu
402+
ramMultiplier
403+
diskLimitPerVcpu
404+
specifics(input: { dataCenterId: $dataCenterId }) {
405+
stockStatus
406+
securePrice
407+
}
408+
}
409+
}
410+
"""
411+
412+
289413
gpu_types_query = """
290414
query GpuTypes {
291415
countryCodes

src/integrity_tests/test_runpod.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,11 @@ def test_gpu_present(data_rows):
5858
refs = set(name for _, name in get_gpu_map().values())
5959
gpus = set(select_row(data_rows, "gpu_name"))
6060
assert len(refs & gpus) > 7
61+
62+
63+
def test_cpu_offers_integrity(data_rows):
    """Check that GPU-less catalog rows look like RunPod CPU offers.

    Rows are selected by ``gpu_count == "0"``; at least one must exist, and
    each must carry the ``runpod-cpu`` flag, be non-spot and have a
    dash-separated location.
    """
    cpu_rows = [row for row in data_rows if row["gpu_count"] == "0"]
    assert cpu_rows
    assert not any("runpod-cpu" not in row["flags"].split(",") for row in cpu_rows)
    assert not any(row["spot"] != "False" for row in cpu_rows)
    assert not any("-" not in row["location"] for row in cpu_rows)

src/tests/providers/test_runpod.py

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
import pytest
2+
from requests import RequestException
3+
4+
from gpuhunt._internal.models import RawCatalogItem
5+
from gpuhunt.providers import runpod as runpod_module
6+
from gpuhunt.providers.runpod import RunpodProvider, _cpu_size_ladder
7+
8+
9+
def test_cpu_size_ladder():
    """The ladder doubles from the minimum and always ends on the maximum."""
    expected_by_range = {
        (2, 32): [2, 4, 8, 16, 32],
        (3, 20): [3, 6, 12, 20],
    }
    for (low, high), expected in expected_by_range.items():
        assert _cpu_size_ladder(low, high) == expected
12+
13+
14+
def test_make_cpu_catalog_items():
    """A valid flavor expands into one item per ladder step with scaled specs."""
    flavor = {
        "id": "cpu3g",
        "minVcpu": 2,
        "maxVcpu": 32,
        "ramMultiplier": 4,
        "diskLimitPerVcpu": 10,
        "specifics": {
            "stockStatus": "High",
            "securePrice": 0.08,
            "slsPrice": 0.1,
        },
    }
    provider = object.__new__(RunpodProvider)

    items = provider._make_cpu_catalog_items("AP-JP-1", [flavor])

    expected_names = [
        "cpu3g-2-8",
        "cpu3g-4-16",
        "cpu3g-8-32",
        "cpu3g-16-64",
        "cpu3g-32-128",
    ]
    count = len(expected_names)
    assert [item.instance_name for item in items] == expected_names
    assert [item.cpu for item in items] == [2, 4, 8, 16, 32]
    assert [item.memory for item in items] == [8, 16, 32, 64, 128]
    assert [item.location for item in items] == ["AP-JP-1"] * count
    assert [item.spot for item in items] == [False] * count
    assert [item.gpu_count for item in items] == [0] * count
    assert [item.flags for item in items] == [["runpod-cpu"]] * count
    assert [item.disk_size for item in items] == [20.0, 40.0, 80.0, 160.0, 320.0]

    smallest, largest = items[0], items[-1]
    assert smallest.price == pytest.approx(0.08)
    assert largest.price == pytest.approx(1.28)
    assert smallest.provider_data == {}
    assert largest.provider_data == {}
51+
52+
53+
def test_make_cpu_catalog_items_skips_invalid_flavors():
    """Every flavor below violates exactly one validity rule and is dropped."""

    def make_flavor(
        flavor_id,
        *,
        ram,
        secure,
        sls,
        min_vcpu=2,
        max_vcpu=32,
        disk=10,
        stock="High",
    ):
        # Build a flavor dict in the same shape the cpuFlavors query returns.
        return {
            "id": flavor_id,
            "minVcpu": min_vcpu,
            "maxVcpu": max_vcpu,
            "ramMultiplier": ram,
            "diskLimitPerVcpu": disk,
            "specifics": {
                "stockStatus": stock,
                "securePrice": secure,
                "slsPrice": sls,
            },
        }

    cpu_flavors = [
        # No stock status.
        make_flavor("cpu3c", ram=2, secure=0.06, sls=0.072, stock=None),
        # No secure price.
        make_flavor("cpu3m", ram=8, secure=None, sls=0),
        # Missing minimum vCPU count.
        make_flavor("cpu5c", ram=2, secure=0.07, sls=0.084, min_vcpu=None, disk=15),
        # Minimum above maximum.
        make_flavor("cpu5m", ram=8, secure=0.13, sls=0, min_vcpu=16, max_vcpu=8),
        # Missing disk limit.
        make_flavor("cpu5g", ram=4, secure=0.08, sls=0.1, disk=None),
        # Zero disk limit.
        make_flavor("cpu5x", ram=4, secure=0.08, sls=0.1, disk=0),
    ]
    provider = object.__new__(RunpodProvider)

    assert provider._make_cpu_catalog_items("AP-JP-1", cpu_flavors) == []
131+
132+
133+
def test_fetch_cpu_offers_handles_partial_datacenter_failures(monkeypatch):
    """A failing data center is skipped while the healthy one still yields offers."""
    data_centers_response = {
        "data": {
            "dataCenters": [
                {"id": "US-IL-1", "listed": True},
                {"id": "AP-JP-1", "listed": True},
                {"id": "DC-SKIP", "listed": False},
            ]
        }
    }
    healthy_flavor = {
        "id": "cpu3c",
        "minVcpu": 2,
        "maxVcpu": 32,
        "ramMultiplier": 2,
        "diskLimitPerVcpu": 10,
        "specifics": {
            "stockStatus": "High",
            "securePrice": 0.06,
            "slsPrice": 0.072,
        },
    }

    def fake_make_request(payload):
        assert payload["query"] == runpod_module.cpu_data_centers_query
        return data_centers_response

    def fake_get_cpu_flavors(dc_id: str):
        # Only US-IL-1 fails; every other data center returns the same flavor.
        if dc_id != "US-IL-1":
            return [healthy_flavor]
        raise RequestException("boom")

    provider = object.__new__(RunpodProvider)
    monkeypatch.setattr(runpod_module, "_make_request", fake_make_request)
    monkeypatch.setattr(provider, "_get_cpu_flavors", fake_get_cpu_flavors)

    items = provider._fetch_cpu_offers()

    assert len(items) == 5
    locations = {item.location for item in items}
    assert locations == {"AP-JP-1"}
    assert all("-" in item.location for item in items)
    assert all(item.spot is False for item in items)
    assert {tuple(item.flags) for item in items} == {("runpod-cpu",)}
    assert {item.disk_size for item in items} == {20.0, 40.0, 80.0, 160.0, 320.0}
    names = {item.instance_name for item in items}
    assert names == {
        "cpu3c-2-4",
        "cpu3c-4-8",
        "cpu3c-8-16",
        "cpu3c-16-32",
        "cpu3c-32-64",
    }
184+
185+
186+
def test_fetch_offers_appends_cpu_items(monkeypatch):
    """With GPU and cluster sources stubbed empty, only the CPU item is returned."""
    provider = object.__new__(RunpodProvider)
    cpu_item = RawCatalogItem(
        instance_name="cpu3g-2-8",
        location="AP-JP-1",
        price=0.08,
        cpu=2,
        memory=8,
        gpu_count=0,
        gpu_name=None,
        gpu_memory=None,
        spot=False,
        disk_size=20.0,
        flags=["runpod-cpu"],
        provider_data={},
    )

    # Stub the three offer sources in the order the original test patched them.
    stubs = {
        "_build_query_variables": lambda: [],
        "_fetch_cluster_offers": lambda: [],
        "_fetch_cpu_offers": lambda: [cpu_item],
    }
    for attr_name, stub in stubs.items():
        monkeypatch.setattr(provider, attr_name, stub)

    offers = provider._fetch_offers()

    assert offers == [cpu_item]

0 commit comments

Comments
 (0)