fix: change default base URLs from localhost to 127.0.0.1 to prevent … (#7)

NullPointerDepressiveDisorder · web-flow · commit fbc417cf3423 · 2026-03-30T18:41:44.000-07:00
* fix: change default base URLs from localhost to 127.0.0.1 to prevent Windows IPv6 resolution issues, pass model_id to llama-cpp, and correct model path parsing in the sweep command.

* fix(llama_cpp): Correct model_id fallback and improve test robustness

* test: improve llama-cpp backend tests for cleanup and backend name handling
diff --git a/README.md b/README.md
@@ -106,7 +106,7 @@ Same model, same quant, different inference paths. Catches serving-layer bugs.
 infer-check diff \
   --model mlx-community/Meta-Llama-3.1-8B-Instruct-4bit \
   --backends "mlx-lm,openai-compat" \
-  --base-urls ",http://localhost:8000" \
+  --base-urls ",http://127.0.0.1:8000" \
   --prompts reasoning \
   --output ./results/diff/
 ```
@@ -134,7 +134,7 @@ Concurrent requests through a serving backend. Tests KV cache correctness under
 infer-check stress \
   --model mlx-community/Meta-Llama-3.1-8B-Instruct-4bit \
   --backend openai-compat \
-  --base-url http://localhost:8000 \
+  --base-url http://127.0.0.1:8000 \
   --prompts reasoning \
   --concurrency 1,2,4,8 \
   --output ./results/stress/
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,6 +33,7 @@ dev = [
     "infer-check[all]",
     "mypy>=1.19.1",
     "pytest>=9.0.2",
+    "pytest-asyncio>=1.3.0",
     "pytest-cov>=7.0.0",
     "ruff>=0.15.5",
 ]
@@ -58,6 +59,7 @@ dev = [
     "mypy>=1.19.1",
     "pre-commit>=4.5.1",
     "pytest>=9.0.2",
+    "pytest-asyncio>=1.3.0",
     "pytest-cov>=7.0.0",
     "ruff>=0.15.5",
 ]
diff --git a/src/infer_check/backends/base.py b/src/infer_check/backends/base.py
@@ -57,12 +57,12 @@ def get_backend(config: BackendConfig) -> BackendAdapter:
     elif config.backend_type == "llama-cpp":
         from infer_check.backends.llama_cpp import LlamaCppBackend
 
-        url = config.base_url or "http://localhost:8080"
-        return LlamaCppBackend(base_url=url)
+        url = config.base_url or "http://127.0.0.1:8080"
+        return LlamaCppBackend(model_id=config.model_id, base_url=url)
     elif config.backend_type == "vllm-mlx":
         from infer_check.backends.vllm_mlx import VLLMMLXBackend
 
-        url = config.base_url or "http://localhost:8000"
+        url = config.base_url or "http://127.0.0.1:8000"
         return VLLMMLXBackend(
             model_id=config.model_id,
             base_url=url,
@@ -73,7 +73,7 @@ def get_backend(config: BackendConfig) -> BackendAdapter:
 
         if not config.base_url:
             raise ValueError(
-                "openai-compat backend requires --base-url. Example: --base-url http://localhost:11434/v1 (Ollama)"
+                "openai-compat backend requires --base-url. Example: --base-url http://127.0.0.1:11434/v1 (Ollama)"
             )
         return OpenAICompatBackend(
             base_url=config.base_url,
diff --git a/src/infer_check/backends/llama_cpp.py b/src/infer_check/backends/llama_cpp.py
@@ -19,7 +19,8 @@ class LlamaCppBackend:
     Communicates via the ``/completion`` endpoint.
     """
 
-    def __init__(self, base_url: str = "http://localhost:8080") -> None:
+    def __init__(self, model_id: str, base_url: str = "http://127.0.0.1:8080") -> None:
+        self._model_id = model_id
         self._base_url = base_url.rstrip("/")
         self._client = httpx.AsyncClient(base_url=self._base_url, timeout=120.0)
 
@@ -34,6 +35,7 @@ def name(self) -> str:
     async def generate(self, prompt: Prompt) -> InferenceResult:
         """Send a completion request and parse the response."""
         payload = {
+            "model": self._model_id,
             "prompt": prompt.text,
             "n_predict": prompt.max_tokens,
             "temperature": prompt.metadata.get("temperature", 0.0) if prompt.metadata else 0.0,
@@ -45,10 +47,16 @@ async def generate(self, prompt: Prompt) -> InferenceResult:
             response = await self._client.post("/completion", json=payload)
             response.raise_for_status()
         except httpx.ConnectError as exc:
+            # On Windows, localhost often resolves to IPv6 [::1] which many servers don't bind to.
+            # Using 127.0.0.1 (IPv4) is generally more reliable for local connections.
+            extra_hint = ""
+            if "localhost" in self._base_url:
+                extra_hint = "\nHint: Try using 127.0.0.1 instead of localhost on Windows."
+
             raise RuntimeError(
                 f"Cannot connect to llama-server at {self._base_url}. "
-                "Start it with: llama-server -m <model.gguf> --port 8080\n"
-                "Or use Ollama: ollama serve"
+                f"Start it with: llama-server -m <model.gguf> --port 8080\n"
+                f"Or use Ollama: ollama serve{extra_hint}"
             ) from exc
         except httpx.TimeoutException as exc:
             raise RuntimeError(
@@ -116,7 +124,7 @@ async def generate(self, prompt: Prompt) -> InferenceResult:
         return InferenceResult(
             prompt_id=prompt.id,
             backend_name=self.name,
-            model_id=data.get("model", "unknown"),
+            model_id=data.get("model", self._model_id),
             tokens=tokens,
             logprobs=logprobs,
             distributions=distributions,
diff --git a/src/infer_check/backends/vllm_mlx.py b/src/infer_check/backends/vllm_mlx.py
@@ -20,7 +20,7 @@ class VLLMMLXBackend(OpenAICompatBackend):
     def __init__(
         self,
         model_id: str,
-        base_url: str = "http://localhost:8000",
+        base_url: str = "http://127.0.0.1:8000",
         api_key: str | None = None,
         chat: bool = False,
     ) -> None:
@@ -53,7 +53,7 @@ def from_model(
         cls,
         model_id: str,
         quantization: str | None = None,
-        base_url: str = "http://localhost:8000",
+        base_url: str = "http://127.0.0.1:8000",
     ) -> VLLMMLXBackend:
         """Create a backend for *model_id*.
 
@@ -70,7 +70,7 @@ def from_model(
         Args:
             model_id: HuggingFace model identifier.
             quantization: Optional quantization string (e.g. ``"4bit"``).
-            base_url: Server URL (default ``http://localhost:8000``).
+            base_url: Server URL (default ``http://127.0.0.1:8000``).
 
         Returns:
             A configured :class:`VLLMMLXBackend` instance.
diff --git a/src/infer_check/cli.py b/src/infer_check/cli.py
@@ -96,7 +96,7 @@ def sweep(
         entry = entry.strip()
         if "=" in entry:
             label, path = entry.split("=", 1)
-            model_map[label.strip()] = path.strip()
+            model_map[label.strip()] = path.strip().lstrip("=").strip()
         else:
             # No label provided — use the last path component as label
             label = entry.strip().rsplit("/", 1)[-1]
diff --git a/src/infer_check/resolve.py b/src/infer_check/resolve.py
@@ -9,7 +9,7 @@
   3. Explicit prefix  — ``gguf:/path/to/model.gguf`` → llama-cpp
   4. Local .gguf file — path exists and ends with ``.gguf`` → llama-cpp
   5. HF repo with ``-mlx`` or ``mlx-community/`` → mlx-lm
-  6. HF repo with ``-GGUF`` or ``-gguf`` → llama-cpp (default: http://localhost:8080)
+  6. HF repo with ``-GGUF`` or ``-gguf`` → llama-cpp (default: http://127.0.0.1:8080)
   7. Fallback — assume mlx-lm (most common local Mac use case)
 """
 
@@ -34,9 +34,9 @@
 
 # Default base URLs per backend (can be overridden via CLI).
 _DEFAULT_URLS: dict[BackendType, str] = {
-    "openai-compat": "http://localhost:11434/v1",  # Ollama
-    "llama-cpp": "http://localhost:8080",
-    "vllm-mlx": "http://localhost:8000",
+    "openai-compat": "http://127.0.0.1:11434/v1",  # Ollama
+    "llama-cpp": "http://127.0.0.1:8080",
+    "vllm-mlx": "http://127.0.0.1:8000",
 }
 
 
diff --git a/tests/unit/test_cli_parsing.py b/tests/unit/test_cli_parsing.py
@@ -0,0 +1,55 @@
+from datetime import UTC, datetime
+from pathlib import Path
+from unittest.mock import MagicMock, Mock, patch
+
+from infer_check.types import SweepResult
+
+
+def test_sweep_model_parsing_robustness() -> None:
+    """Test that sweep command parses model paths robustly, handling extra equals signs."""
+    # Create a mock SweepResult to return from runner.sweep
+    mock_sweep_result = SweepResult(
+        model_id="test-model",
+        backend_name="test-backend",
+        quantization_levels=["bf16", "4bit"],
+        comparisons=[],
+        timestamp=datetime.now(UTC),
+        summary={},
+    )
+
+    # We mock get_backend_for_model and TestRunner.sweep to avoid actual initialization
+    with (
+        patch("infer_check.backends.base.get_backend_for_model") as mock_get_backend,
+        patch("infer_check.runner.TestRunner.sweep", new_callable=Mock),
+        patch("infer_check.suites.loader.load_suite", return_value=[MagicMock()]),
+        patch("infer_check.cli._resolve_prompts", return_value=Path("dummy.jsonl")),
+        patch("asyncio.run", return_value=mock_sweep_result),
+    ):
+        mock_get_backend.return_value.name = "test-backend"
+        # Simulates: infer-check sweep --models "bf16==bartowski/Qwen,4bit=bartowski/Qwen" --prompts reasoning
+        # The command is invoked in-process through click's CliRunner rather than a real shell.
+        from click.testing import CliRunner
+
+        from infer_check.cli import main
+
+        runner = CliRunner()
+        # Using a subset of arguments to trigger the parsing logic
+        with runner.isolated_filesystem():
+            result = runner.invoke(
+                main, ["sweep", "--models", "bf16==bartowski/Qwen,4bit=bartowski/Qwen", "--prompts", "reasoning"]
+            )
+        assert result.exit_code == 0, result.output
+
+        # Check if get_backend_for_model was called with cleaned paths
+        # It should be called twice: once for bf16 and once for 4bit
+        assert mock_get_backend.call_count == 2
+
+        # Check first call (bf16)
+        args, kwargs = mock_get_backend.call_args_list[0]
+        assert kwargs["model_str"] == "bartowski/Qwen"
+        assert kwargs["quantization"] == "bf16"
+
+        # Check second call (4bit)
+        args, kwargs = mock_get_backend.call_args_list[1]
+        assert kwargs["model_str"] == "bartowski/Qwen"
+        assert kwargs["quantization"] == "4bit"
diff --git a/tests/unit/test_llama_cpp_fallback.py b/tests/unit/test_llama_cpp_fallback.py
@@ -0,0 +1,30 @@
+from unittest.mock import patch
+
+import httpx
+import pytest
+
+from infer_check.backends.llama_cpp import LlamaCppBackend
+from infer_check.types import Prompt
+
+
+@pytest.mark.asyncio
+async def test_llama_cpp_model_id_fallback() -> None:
+    model_id = "test-model-gguf"
+    backend = LlamaCppBackend(model_id=model_id, base_url="http://127.0.0.1:8080")
+    prompt = Prompt(id="p1", text="Hello", max_tokens=10)
+
+    # Response missing "model" field
+    mock_response = httpx.Response(
+        200,
+        json={"content": " world", "timings": {"predicted_per_second": 10.0}},
+        request=httpx.Request("POST", "http://127.0.0.1:8080/completion"),
+    )
+
+    try:
+        with patch("httpx.AsyncClient.post", return_value=mock_response):
+            res = await backend.generate(prompt)
+
+            # Verify it falls back to backend's model_id instead of "unknown"
+            assert res.model_id == model_id
+    finally:
+        await backend.cleanup()
diff --git a/tests/unit/test_llama_cpp_payload.py b/tests/unit/test_llama_cpp_payload.py
@@ -0,0 +1,39 @@
+from unittest.mock import patch
+
+import httpx
+import pytest
+
+from infer_check.backends.llama_cpp import LlamaCppBackend
+from infer_check.types import Prompt
+
+
+@pytest.mark.asyncio
+async def test_llama_cpp_includes_model_in_payload() -> None:
+    model_id = "test-model-gguf"
+    backend = LlamaCppBackend(model_id=model_id, base_url="http://127.0.0.1:8080")
+    prompt = Prompt(id="p1", text="Hello", max_tokens=10)
+
+    mock_response = httpx.Response(
+        200,
+        json={"content": " world", "model": model_id, "timings": {"predicted_per_second": 10.0}},
+        request=httpx.Request("POST", "http://127.0.0.1:8080/completion"),
+    )
+
+    try:
+        with patch("httpx.AsyncClient.post", return_value=mock_response) as mock_post:
+            res = await backend.generate(prompt)
+
+            # Verify the call to post
+            assert mock_post.called
+            args, kwargs = mock_post.call_args
+            assert args[0] == "/completion"
+            payload = kwargs["json"]
+            assert payload["model"] == model_id
+            assert payload["prompt"] == "Hello"
+            assert payload["n_predict"] == 10
+
+            # Verify result
+            assert res.text == " world"
+            assert res.model_id == model_id
+    finally:
+        await backend.cleanup()
diff --git a/tests/unit/test_openai_compat.py b/tests/unit/test_openai_compat.py
@@ -9,13 +9,13 @@
 
 
 def test_generate_chat_success() -> None:
-    backend = OpenAICompatBackend(base_url="http://localhost:8000", model_id="dummy", chat=True)
+    backend = OpenAICompatBackend(base_url="http://127.0.0.1:8000", model_id="dummy", chat=True)
     prompt = Prompt(id="p1", text="Hello", max_tokens=10)
 
     mock_response = httpx.Response(
         200,
         json={"choices": [{"message": {"content": "world"}}], "usage": {"completion_tokens": 1}},
-        request=httpx.Request("POST", "http://localhost:8000/v1/chat/completions"),
+        request=httpx.Request("POST", "http://127.0.0.1:8000/v1/chat/completions"),
     )
 
     with patch("httpx.AsyncClient.post", return_value=mock_response):
@@ -26,7 +26,7 @@ def test_generate_chat_success() -> None:
 
 
 def test_generate_chat_connection_refused() -> None:
-    backend = OpenAICompatBackend(base_url="http://localhost:8000", model_id="dummy", chat=True)
+    backend = OpenAICompatBackend(base_url="http://127.0.0.1:8000", model_id="dummy", chat=True)
     prompt = Prompt(id="p1", text="Hello", max_tokens=10)
 
     with patch("httpx.AsyncClient.post", side_effect=httpx.ConnectError("Connection refused")):
@@ -36,7 +36,7 @@ def test_generate_chat_connection_refused() -> None:
 
 
 def test_generate_chat_timeout() -> None:
-    backend = OpenAICompatBackend(base_url="http://localhost:8000", model_id="dummy", chat=True)
+    backend = OpenAICompatBackend(base_url="http://127.0.0.1:8000", model_id="dummy", chat=True)
     prompt = Prompt(id="p1", text="Hello", max_tokens=10)
 
     with patch("httpx.AsyncClient.post", side_effect=httpx.TimeoutException("Timeout")):
@@ -46,10 +46,10 @@ def test_generate_chat_timeout() -> None:
 
 
 def test_generate_completions_404() -> None:
-    backend = OpenAICompatBackend(base_url="http://localhost:8000", model_id="dummy", chat=False)
+    backend = OpenAICompatBackend(base_url="http://127.0.0.1:8000", model_id="dummy", chat=False)
     prompt = Prompt(id="p1", text="Hello", max_tokens=10)
 
-    mock_response = httpx.Response(404, request=httpx.Request("POST", "http://localhost:8000/v1/completions"))
+    mock_response = httpx.Response(404, request=httpx.Request("POST", "http://127.0.0.1:8000/v1/completions"))
     with patch(
         "httpx.AsyncClient.post",
         side_effect=httpx.HTTPStatusError("404 Not Found", request=mock_response.request, response=mock_response),
@@ -60,13 +60,13 @@ def test_generate_completions_404() -> None:
 
 
 def test_generate_chat_malformed_json() -> None:
-    backend = OpenAICompatBackend(base_url="http://localhost:8000", model_id="dummy", chat=True)
+    backend = OpenAICompatBackend(base_url="http://127.0.0.1:8000", model_id="dummy", chat=True)
     prompt = Prompt(id="p1", text="Hello", max_tokens=10)
 
     mock_response = httpx.Response(
         200,
         text="Not valid JSON",
-        request=httpx.Request("POST", "http://localhost:8000/v1/chat/completions"),
+        request=httpx.Request("POST", "http://127.0.0.1:8000/v1/chat/completions"),
     )
     with patch("httpx.AsyncClient.post", return_value=mock_response):
         with pytest.raises(RuntimeError) as exc:
@@ -75,13 +75,13 @@ def test_generate_chat_malformed_json() -> None:
 
 
 def test_generate_empty_choices() -> None:
-    backend = OpenAICompatBackend(base_url="http://localhost:8000", model_id="dummy", chat=True)
+    backend = OpenAICompatBackend(base_url="http://127.0.0.1:8000", model_id="dummy", chat=True)
     prompt = Prompt(id="p1", text="Hello", max_tokens=10)
 
     mock_response = httpx.Response(
         200,
         json={"choices": []},
-        request=httpx.Request("POST", "http://localhost:8000/v1/chat/completions"),
+        request=httpx.Request("POST", "http://127.0.0.1:8000/v1/chat/completions"),
     )
     with patch("httpx.AsyncClient.post", return_value=mock_response):
         with pytest.raises(RuntimeError) as exc:
diff --git a/tests/unit/test_resolve.py b/tests/unit/test_resolve.py