Skip to content

Commit fbc417c

Browse files
fix: change default base URLs from localhost to 127.0.0.1 to prevent … (#7)
* fix: change default base URLs from localhost to 127.0.0.1 to prevent Windows IPv6 resolution issues, pass model_id to llama-cpp, and correct model path parsing in the sweep command.
* fix(llama_cpp): Correct model_id fallback and improve test robustness
* test: improve llama-cpp backend tests for cleanup and backend name handling
1 parent c2822a9 commit fbc417c

12 files changed

Lines changed: 166 additions & 32 deletions

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ Same model, same quant, different inference paths. Catches serving-layer bugs.
106106
infer-check diff \
107107
--model mlx-community/Meta-Llama-3.1-8B-Instruct-4bit \
108108
--backends "mlx-lm,openai-compat" \
109-
--base-urls ",http://localhost:8000" \
109+
--base-urls ",http://127.0.0.1:8000" \
110110
--prompts reasoning \
111111
--output ./results/diff/
112112
```
@@ -134,7 +134,7 @@ Concurrent requests through a serving backend. Tests KV cache correctness under
134134
infer-check stress \
135135
--model mlx-community/Meta-Llama-3.1-8B-Instruct-4bit \
136136
--backend openai-compat \
137-
--base-url http://localhost:8000 \
137+
--base-url http://127.0.0.1:8000 \
138138
--prompts reasoning \
139139
--concurrency 1,2,4,8 \
140140
--output ./results/stress/

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ dev = [
3333
"infer-check[all]",
3434
"mypy>=1.19.1",
3535
"pytest>=9.0.2",
36+
"pytest-asyncio>=1.3.0",
3637
"pytest-cov>=7.0.0",
3738
"ruff>=0.15.5",
3839
]
@@ -58,6 +59,7 @@ dev = [
5859
"mypy>=1.19.1",
5960
"pre-commit>=4.5.1",
6061
"pytest>=9.0.2",
62+
"pytest-asyncio>=1.3.0",
6163
"pytest-cov>=7.0.0",
6264
"ruff>=0.15.5",
6365
]

src/infer_check/backends/base.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,12 @@ def get_backend(config: BackendConfig) -> BackendAdapter:
5757
elif config.backend_type == "llama-cpp":
5858
from infer_check.backends.llama_cpp import LlamaCppBackend
5959

60-
url = config.base_url or "http://localhost:8080"
61-
return LlamaCppBackend(base_url=url)
60+
url = config.base_url or "http://127.0.0.1:8080"
61+
return LlamaCppBackend(model_id=config.model_id, base_url=url)
6262
elif config.backend_type == "vllm-mlx":
6363
from infer_check.backends.vllm_mlx import VLLMMLXBackend
6464

65-
url = config.base_url or "http://localhost:8000"
65+
url = config.base_url or "http://127.0.0.1:8000"
6666
return VLLMMLXBackend(
6767
model_id=config.model_id,
6868
base_url=url,
@@ -73,7 +73,7 @@ def get_backend(config: BackendConfig) -> BackendAdapter:
7373

7474
if not config.base_url:
7575
raise ValueError(
76-
"openai-compat backend requires --base-url. Example: --base-url http://localhost:11434/v1 (Ollama)"
76+
"openai-compat backend requires --base-url. Example: --base-url http://127.0.0.1:11434/v1 (Ollama)"
7777
)
7878
return OpenAICompatBackend(
7979
base_url=config.base_url,

src/infer_check/backends/llama_cpp.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ class LlamaCppBackend:
1919
Communicates via the ``/completion`` endpoint.
2020
"""
2121

22-
def __init__(self, base_url: str = "http://localhost:8080") -> None:
22+
def __init__(self, model_id: str, base_url: str = "http://127.0.0.1:8080") -> None:
23+
self._model_id = model_id
2324
self._base_url = base_url.rstrip("/")
2425
self._client = httpx.AsyncClient(base_url=self._base_url, timeout=120.0)
2526

@@ -34,6 +35,7 @@ def name(self) -> str:
3435
async def generate(self, prompt: Prompt) -> InferenceResult:
3536
"""Send a completion request and parse the response."""
3637
payload = {
38+
"model": self._model_id,
3739
"prompt": prompt.text,
3840
"n_predict": prompt.max_tokens,
3941
"temperature": prompt.metadata.get("temperature", 0.0) if prompt.metadata else 0.0,
@@ -45,10 +47,16 @@ async def generate(self, prompt: Prompt) -> InferenceResult:
4547
response = await self._client.post("/completion", json=payload)
4648
response.raise_for_status()
4749
except httpx.ConnectError as exc:
50+
# On Windows, localhost often resolves to IPv6 [::1] which many servers don't bind to.
51+
# Using 127.0.0.1 (IPv4) is generally more reliable for local connections.
52+
extra_hint = ""
53+
if "localhost" in self._base_url:
54+
extra_hint = "\nHint: Try using 127.0.0.1 instead of localhost on Windows."
55+
4856
raise RuntimeError(
4957
f"Cannot connect to llama-server at {self._base_url}. "
50-
"Start it with: llama-server -m <model.gguf> --port 8080\n"
51-
"Or use Ollama: ollama serve"
58+
f"Start it with: llama-server -m <model.gguf> --port 8080\n"
59+
f"Or use Ollama: ollama serve{extra_hint}"
5260
) from exc
5361
except httpx.TimeoutException as exc:
5462
raise RuntimeError(
@@ -116,7 +124,7 @@ async def generate(self, prompt: Prompt) -> InferenceResult:
116124
return InferenceResult(
117125
prompt_id=prompt.id,
118126
backend_name=self.name,
119-
model_id=data.get("model", "unknown"),
127+
model_id=data.get("model", self._model_id),
120128
tokens=tokens,
121129
logprobs=logprobs,
122130
distributions=distributions,

src/infer_check/backends/vllm_mlx.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ class VLLMMLXBackend(OpenAICompatBackend):
2020
def __init__(
2121
self,
2222
model_id: str,
23-
base_url: str = "http://localhost:8000",
23+
base_url: str = "http://127.0.0.1:8000",
2424
api_key: str | None = None,
2525
chat: bool = False,
2626
) -> None:
@@ -53,7 +53,7 @@ def from_model(
5353
cls,
5454
model_id: str,
5555
quantization: str | None = None,
56-
base_url: str = "http://localhost:8000",
56+
base_url: str = "http://127.0.0.1:8000",
5757
) -> VLLMMLXBackend:
5858
"""Create a backend for *model_id*.
5959
@@ -70,7 +70,7 @@ def from_model(
7070
Args:
7171
model_id: HuggingFace model identifier.
7272
quantization: Optional quantization string (e.g. ``"4bit"``).
73-
base_url: Server URL (default ``http://localhost:8000``).
73+
base_url: Server URL (default ``http://127.0.0.1:8000``).
7474
7575
Returns:
7676
A configured :class:`VLLMMLXBackend` instance.

src/infer_check/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def sweep(
9696
entry = entry.strip()
9797
if "=" in entry:
9898
label, path = entry.split("=", 1)
99-
model_map[label.strip()] = path.strip()
99+
model_map[label.strip()] = path.strip().lstrip("=").strip()
100100
else:
101101
# No label provided — use the last path component as label
102102
label = entry.strip().rsplit("/", 1)[-1]

src/infer_check/resolve.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
3. Explicit prefix — ``gguf:/path/to/model.gguf`` → llama-cpp
1010
4. Local .gguf file — path exists and ends with ``.gguf`` → llama-cpp
1111
5. HF repo with ``-mlx`` or ``mlx-community/`` → mlx-lm
12-
6. HF repo with ``-GGUF`` or ``-gguf`` → llama-cpp (default: http://localhost:8080)
12+
6. HF repo with ``-GGUF`` or ``-gguf`` → llama-cpp (default: http://127.0.0.1:8080)
1313
7. Fallback — assume mlx-lm (most common local Mac use case)
1414
"""
1515

@@ -34,9 +34,9 @@
3434

3535
# Default base URLs per backend (can be overridden via CLI).
3636
_DEFAULT_URLS: dict[BackendType, str] = {
37-
"openai-compat": "http://localhost:11434/v1", # Ollama
38-
"llama-cpp": "http://localhost:8080",
39-
"vllm-mlx": "http://localhost:8000",
37+
"openai-compat": "http://127.0.0.1:11434/v1", # Ollama
38+
"llama-cpp": "http://127.0.0.1:8080",
39+
"vllm-mlx": "http://127.0.0.1:8000",
4040
}
4141

4242

tests/unit/test_cli_parsing.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from datetime import UTC, datetime
2+
from pathlib import Path
3+
from unittest.mock import MagicMock, Mock, patch
4+
5+
from infer_check.types import SweepResult
6+
7+
8+
def test_sweep_model_parsing_robustness() -> None:
9+
"""Test that sweep command parses model paths robustly, handling extra equals signs."""
10+
# Create a mock SweepResult to return from runner.sweep
11+
mock_sweep_result = SweepResult(
12+
model_id="test-model",
13+
backend_name="test-backend",
14+
quantization_levels=["bf16", "4bit"],
15+
comparisons=[],
16+
timestamp=datetime.now(UTC),
17+
summary={},
18+
)
19+
20+
# We mock get_backend_for_model and TestRunner.sweep to avoid actual initialization
21+
with (
22+
patch("infer_check.backends.base.get_backend_for_model") as mock_get_backend,
23+
patch("infer_check.runner.TestRunner.sweep", new_callable=Mock),
24+
patch("infer_check.suites.loader.load_suite", return_value=[MagicMock()]),
25+
patch("infer_check.cli._resolve_prompts", return_value=Path("dummy.jsonl")),
26+
patch("asyncio.run", return_value=mock_sweep_result),
27+
):
28+
mock_get_backend.return_value.name = "test-backend"
29+
# Simulating the command: infer-check sweep --models "bf16==bartowski/Qwen,4bit=bartowski/Qwen" --prompts reasoning
30+
# We call the function directly as click command
31+
from click.testing import CliRunner
32+
33+
from infer_check.cli import main
34+
35+
runner = CliRunner()
36+
# Using a subset of arguments to trigger the parsing logic
37+
with runner.isolated_filesystem():
38+
result = runner.invoke(
39+
main, ["sweep", "--models", "bf16==bartowski/Qwen,4bit=bartowski/Qwen", "--prompts", "reasoning"]
40+
)
41+
assert result.exit_code == 0, result.output
42+
43+
# Check if get_backend_for_model was called with cleaned paths
44+
# It should be called twice: once for bf16 and once for 4bit
45+
assert mock_get_backend.call_count == 2
46+
47+
# Check first call (bf16)
48+
args, kwargs = mock_get_backend.call_args_list[0]
49+
assert kwargs["model_str"] == "bartowski/Qwen"
50+
assert kwargs["quantization"] == "bf16"
51+
52+
# Check second call (4bit)
53+
args, kwargs = mock_get_backend.call_args_list[1]
54+
assert kwargs["model_str"] == "bartowski/Qwen"
55+
assert kwargs["quantization"] == "4bit"
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from unittest.mock import patch
2+
3+
import httpx
4+
import pytest
5+
6+
from infer_check.backends.llama_cpp import LlamaCppBackend
7+
from infer_check.types import Prompt
8+
9+
10+
@pytest.mark.asyncio
11+
async def test_llama_cpp_model_id_fallback() -> None:
12+
model_id = "test-model-gguf"
13+
backend = LlamaCppBackend(model_id=model_id, base_url="http://127.0.0.1:8080")
14+
prompt = Prompt(id="p1", text="Hello", max_tokens=10)
15+
16+
# Response missing "model" field
17+
mock_response = httpx.Response(
18+
200,
19+
json={"content": " world", "timings": {"predicted_per_second": 10.0}},
20+
request=httpx.Request("POST", "http://127.0.0.1:8080/completion"),
21+
)
22+
23+
try:
24+
with patch("httpx.AsyncClient.post", return_value=mock_response):
25+
res = await backend.generate(prompt)
26+
27+
# Verify it falls back to backend's model_id instead of "unknown"
28+
assert res.model_id == model_id
29+
finally:
30+
await backend.cleanup()
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
from unittest.mock import patch
2+
3+
import httpx
4+
import pytest
5+
6+
from infer_check.backends.llama_cpp import LlamaCppBackend
7+
from infer_check.types import Prompt
8+
9+
10+
@pytest.mark.asyncio
11+
async def test_llama_cpp_includes_model_in_payload() -> None:
12+
model_id = "test-model-gguf"
13+
backend = LlamaCppBackend(model_id=model_id, base_url="http://127.0.0.1:8080")
14+
prompt = Prompt(id="p1", text="Hello", max_tokens=10)
15+
16+
mock_response = httpx.Response(
17+
200,
18+
json={"content": " world", "model": model_id, "timings": {"predicted_per_second": 10.0}},
19+
request=httpx.Request("POST", "http://127.0.0.1:8080/completion"),
20+
)
21+
22+
try:
23+
with patch("httpx.AsyncClient.post", return_value=mock_response) as mock_post:
24+
res = await backend.generate(prompt)
25+
26+
# Verify the call to post
27+
assert mock_post.called
28+
args, kwargs = mock_post.call_args
29+
assert args[0] == "/completion"
30+
payload = kwargs["json"]
31+
assert payload["model"] == model_id
32+
assert payload["prompt"] == "Hello"
33+
assert payload["n_predict"] == 10
34+
35+
# Verify result
36+
assert res.text == " world"
37+
assert res.model_id == model_id
38+
finally:
39+
await backend.cleanup()

0 commit comments

Comments
 (0)