
Commit af32091

slop: fix bad model matching that led to always swapping
1 parent 83c1e77

3 files changed: 182 additions & 2 deletions


tools/server/server-context.cpp

Lines changed: 7 additions & 2 deletions
@@ -3213,6 +3213,10 @@ std::string server_context::get_current_model_name() const {
     return impl->model_name;
 }
 
+std::string server_context::get_current_model_path() const {
+    return impl->params_base.model.path;
+}
+
 const server_chat_params& server_context::get_chat_params() const {
     return impl->chat_params;
 }
@@ -3593,10 +3597,11 @@ void server_routes::init_routes() {
     // Helper: swap to a model if different from current
     swap_if_needed_fn = [this](const std::string & requested_model) {
        if (requested_model.empty() || !model_manager) return;
-        std::string cur = ctx_server_ref.get_current_model_name();
-        if (requested_model == cur) return;
         auto meta_resolved = model_manager->get_meta(requested_model);
         if (!meta_resolved.has_value()) return;
+        // Compare by model path instead of name, since the requested name
+        // could be an alias or differ from how model_name was derived at load time
+        if (meta_resolved->model_path == ctx_server_ref.get_current_model_path()) return;
         common_params swap_params = params;
         swap_params.model.path = meta_resolved->model_path;
         SRV_INF("swapping to model '%s' (path: %s)\n", requested_model.c_str(), swap_params.model.path.c_str());
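
Why the old check always swapped: the requested model name is often an alias (or otherwise differs from the name derived from the GGUF at load time), so the string comparison never matched and the server reloaded the model on every request. Below is a minimal Python sketch of the before/after comparison logic; the registry, ModelMeta type, alias, and paths are illustrative stand-ins for model_manager and its metadata, not the server's actual types:

# Hypothetical sketch of the bug and the fix; names and paths are made up.
from dataclasses import dataclass

@dataclass
class ModelMeta:
    model_path: str

# The registry can expose the same file under several names (aliases):
registry = {
    "llama-3.2-1b":      ModelMeta("/models/Llama-3.2-1B-Q8_0.gguf"),
    "Llama-3.2-1B-Q8_0": ModelMeta("/models/Llama-3.2-1B-Q8_0.gguf"),
}

current_model_name = "Llama-3.2-1B-Q8_0"  # derived from the GGUF at load time
current_model_path = "/models/Llama-3.2-1B-Q8_0.gguf"

def needs_swap_old(requested: str) -> bool:
    # Buggy: an alias never string-equals the derived name, so this
    # reports a swap even though the same file is already loaded.
    return requested != current_model_name

def needs_swap_new(requested: str) -> bool:
    # Fixed: resolve the request first, then compare canonical paths.
    meta = registry.get(requested)
    if meta is None:
        return False
    return meta.model_path != current_model_path

assert needs_swap_old("llama-3.2-1b") is True   # spurious swap
assert needs_swap_new("llama-3.2-1b") is False  # correctly seen as already loaded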

tools/server/server-context.h

Lines changed: 3 additions & 0 deletions
@@ -81,6 +81,9 @@ struct server_context {
     // Get the currently loaded model's name
     std::string get_current_model_name() const;
 
+    // Get the currently loaded model's path
+    std::string get_current_model_path() const;
+
     // Get the currently loaded model's chat params
     const server_chat_params& get_chat_params() const;
 
Lines changed: 172 additions & 0 deletions
@@ -0,0 +1,172 @@
+"""
+Test that multiple models can be loaded simultaneously when using the router.
+
+Architecture overview:
+- Router mode (with '--'): Router process manages child processes, each child loads one model.
+  Multiple models can be loaded simultaneously up to models_max (default 4).
+- Non-router mode (without '--'): Single process loads one model directly.
+  No multi-model support.
+
+This test verifies router behavior.
+"""
+
+import pytest
+from utils import *
+
+server: ServerProcess
+
+
+@pytest.fixture(autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.router()
+
+
+def _get_model_status(model_id: str) -> str:
+    res = server.make_request("GET", "/models")
+    assert res.status_code == 200
+    for item in res.body.get("data", []):
+        if item.get("id") == model_id:
+            return item["status"]["value"]
+    raise AssertionError(f"Model {model_id} not found in /models response")
+
+
+def _wait_for_model_status(model_id: str, desired: set[str], timeout: int = 60) -> str:
+    deadline = time.time() + timeout
+    last_status = None
+    while time.time() < deadline:
+        last_status = _get_model_status(model_id)
+        if last_status in desired:
+            return last_status
+        time.sleep(1)
+    raise AssertionError(
+        f"Timed out waiting for {model_id} to reach {desired}, last status: {last_status}"
+    )
+
+
+def _load_model_and_wait(
+    model_id: str, timeout: int = 60, headers: dict | None = None
+) -> None:
+    load_res = server.make_request(
+        "POST", "/models/load", data={"model": model_id}, headers=headers
+    )
+    assert load_res.status_code == 200, f"Failed to load model {model_id}: {load_res.body}"
+    assert isinstance(load_res.body, dict)
+    assert load_res.body.get("success") is True
+    _wait_for_model_status(model_id, {"loaded"}, timeout=timeout)
+
+
+def test_router_multiple_models_loaded_simultaneously():
+    """
+    Test that two models can be loaded simultaneously when models_max >= 2.
+    This verifies that the router does NOT unload the current model when loading a new one,
+    unless the models_max limit is reached.
+    """
+    global server
+    server.models_max = 4
+    server.start()
+
+    # Get available models from /models endpoint
+    res = server.make_request("GET", "/models")
+    assert res.status_code == 200
+    available = res.body.get("data", [])
+    assert len(available) >= 1, "No models available in /models response"
+
+    # Use the same model twice to verify simultaneous loading works
+    model_id = available[0]["id"]
+
+    # Load the model
+    _load_model_and_wait(model_id, timeout=120)
+    assert _get_model_status(model_id) == "loaded"
+
+    # Load it again (should succeed since it's already loaded)
+    load_res2 = server.make_request(
+        "POST", "/models/load", data={"model": model_id}
+    )
+    assert load_res2.status_code == 200, f"Second load failed: {load_res2.body}"
+
+    # Model should still be loaded
+    assert _get_model_status(model_id) == "loaded"
+
+
+def test_router_lru_eviction_when_models_max_reached():
+    """
+    Test that LRU eviction occurs when the models_max limit is reached.
+    With models_max=2, loading a 3rd model should evict the least recently used.
+    """
+    global server
+    server.models_max = 2
+    server.start()
+
+    # Get available models
+    res = server.make_request("GET", "/models")
+    assert res.status_code == 200
+    available = res.body.get("data", [])
+    assert len(available) >= 3, f"Need at least 3 models, found: {[m['id'] for m in available]}"
+
+    first = available[0]["id"]
+    second = available[1]["id"]
+    third = available[2]["id"]
+
+    # Load first two models
+    _load_model_and_wait(first, timeout=120)
+    _load_model_and_wait(second, timeout=120)
+
+    assert _get_model_status(first) == "loaded"
+    assert _get_model_status(second) == "loaded"
+
+    # Load third model - should evict the first (LRU)
+    _load_model_and_wait(third, timeout=120)
+
+    assert _get_model_status(third) == "loaded"
+    # First model should have been evicted (LRU)
+    assert _get_model_status(first) == "unloaded", \
+        f"First model should have been evicted by LRU, status: {_get_model_status(first)}"
+    # Second model should still be loaded
+    assert _get_model_status(second) == "loaded", \
+        f"Second model should still be loaded, status: {_get_model_status(second)}"
+
+
+def test_router_chat_completion_with_multiple_models():
+    """
+    Test that chat completions work with different models when multiple are loaded.
+    """
+    global server
+    server.models_max = 4
+    server.start()
+
+    # Get available models
+    res = server.make_request("GET", "/models")
+    assert res.status_code == 200
+    available = res.body.get("data", [])
+    assert len(available) >= 2, f"Need at least 2 models, found: {[m['id'] for m in available]}"
+
+    first_model = available[0]["id"]
+    second_model = available[1]["id"]
+
+    # Load both models
+    _load_model_and_wait(first_model, timeout=120)
+    _load_model_and_wait(second_model, timeout=120)
+
+    # Verify both are loaded
+    assert _get_model_status(first_model) == "loaded"
+    assert _get_model_status(second_model) == "loaded"
+
+    # Make chat completion requests to both models
+    res1 = server.make_request("POST", "/v1/chat/completions", data={
+        "model": first_model,
+        "messages": [{"role": "user", "content": "hello"}],
+        "max_tokens": 4,
+    })
+    assert res1.status_code == 200, f"First model chat completion failed: {res1.body}"
+
+    res2 = server.make_request("POST", "/v1/chat/completions", data={
+        "model": second_model,
+        "messages": [{"role": "user", "content": "hello"}],
+        "max_tokens": 4,
+    })
+    assert res2.status_code == 200, f"Second model chat completion failed: {res2.body}"
+
+    # Both models should still be loaded
+    assert _get_model_status(first_model) == "loaded"
+    assert _get_model_status(second_model) == "loaded"

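The eviction test above assumes the router applies an LRU policy once models_max is reached. A minimal sketch of that policy using collections.OrderedDict; ModelRegistryLRU and its methods are hypothetical illustrations, not the router's actual bookkeeping:

# Hypothetical sketch of LRU eviction as the tests above assume it behaves.
from collections import OrderedDict

class ModelRegistryLRU:
    def __init__(self, models_max: int = 4):
        self.models_max = models_max
        self.loaded: OrderedDict[str, str] = OrderedDict()  # model id -> path

    def load(self, model_id: str, path: str) -> list[str]:
        evicted = []
        if model_id in self.loaded:
            self.loaded.move_to_end(model_id)  # refresh recency, no reload
            return evicted
        while len(self.loaded) >= self.models_max:
            victim, _ = self.loaded.popitem(last=False)  # least recently used
            evicted.append(victim)
        self.loaded[model_id] = path
        return evicted

reg = ModelRegistryLRU(models_max=2)
reg.load("first", "/m/first.gguf")
reg.load("second", "/m/second.gguf")
assert reg.load("third", "/m/third.gguf") == ["first"]  # mirrors the eviction test

Loading an already-loaded model only refreshes its recency rather than reloading, which matches the second-load case in the first test above.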