From 261e1f8f08661eae70dc3737c6d49cd4e3c17e84 Mon Sep 17 00:00:00 2001 From: Elias Date: Tue, 5 May 2026 18:39:47 +0200 Subject: [PATCH 1/3] preparing for PR --- OLLAMA_NEXT_STEPS.md | 128 +++++++++++++++++++++ lmms_eval/models/__init__.py | 1 + lmms_eval/models/chat/ollama.py | 107 ++++++++++++++++++ test/models/test_ollama.py | 192 ++++++++++++++++++++++++++++++++ 4 files changed, 428 insertions(+) create mode 100644 OLLAMA_NEXT_STEPS.md create mode 100644 lmms_eval/models/chat/ollama.py create mode 100644 test/models/test_ollama.py diff --git a/OLLAMA_NEXT_STEPS.md b/OLLAMA_NEXT_STEPS.md new file mode 100644 index 000000000..7768bcc61 --- /dev/null +++ b/OLLAMA_NEXT_STEPS.md @@ -0,0 +1,128 @@ +# Ollama Backend — Next Steps + +## Where we left off + +Branch: `feat/ollama-model` + +Two files were added: + +- `lmms_eval/models/chat/ollama.py` — the backend +- `test/models/test_ollama.py` — unit tests (10/10 passing, no Ollama required) + +One line was added to `lmms_eval/models/__init__.py`: +```python +AVAILABLE_CHAT_TEMPLATE_MODELS = { + "ollama": "Ollama", # ← added + ... +} +``` + +--- + +## What still needs to be done + +### 1. Install Ollama and pull a vision model + +```bash +# Install from https://ollama.com +ollama serve # start the server (runs on localhost:11434) +ollama pull llava # smallest vision model, good for smoke testing +ollama pull llava-llama3 # better quality if you have the VRAM +``` + +### 2. Run a live smoke test + +```bash +uv run python -m lmms_eval \ + --model ollama \ + --model_args model_version=llava \ + --tasks mme \ + --limit 8 +``` + +Expected: 8 samples evaluated, scores printed. If it errors, check: +- Is `ollama serve` running? +- Does `ollama list` show `llava`? +- Is the `logprobs` field actually present in `/api/generate` responses for your Ollama version? + +### 3. Verify loglikelihood against the real API + +The `loglikelihood` implementation uses `POST /api/generate` with `logprobs=True`. +This field was added in Ollama v0.1.38. Confirm it works: + +```bash +curl http://localhost:11434/api/generate -d '{ + "model": "llava", + "prompt": "The sky is blue", + "stream": false, + "logprobs": true +}' +``` + +The response should contain a `"logprobs"` array of floats. If it's missing or null, +the implementation will silently return `-inf` for all loglikelihood requests — tasks +that depend on it (e.g. multiple-choice scoring) will produce wrong results. + +### 4. Run pre-commit linting (required before PR) + +```bash +uv run pip install pre-commit +uv run pre-commit install +uv run pre-commit run --all-files +``` + +This runs Black (line length 240) + isort. Fix any formatting issues it flags. + +### 5. Commit + +```bash +git add lmms_eval/models/chat/ollama.py \ + lmms_eval/models/__init__.py \ + test/models/test_ollama.py +git commit -m "feat: add Ollama local inference backend" +``` + +### 6. Open a pull request + +The upstream repo is `EvolvingLMMs-Lab/lmms-eval`. You'll need to fork it if you +haven't already, push the branch, and open a PR against `main`. + +PR description should include: +- What model this adds and why (local inference, no API key, multimodal) +- Supported Ollama models (llava, llava-llama3, moondream, minicpm-v, ...) 
+- Known limitations (no video/audio, logprobs requires Ollama ≥ v0.1.38) +- Example usage command + +--- + +## Known limitations to mention in the PR + +| Feature | Status | +|---|---| +| Image input | Works (base64 via OpenAI-compat /v1) | +| Text-only models | Works | +| Video input | Not supported by Ollama | +| Audio input | Not supported by Ollama | +| `loglikelihood` | Works if Ollama ≥ v0.1.38 and model supports it | +| `generate_until_multi_round` | Inherited from OpenAI parent (raises NotImplementedError) | + +--- + +## Quick reference + +```bash +# Set up on new machine +git clone +cd lmms-eval +git checkout feat/ollama-model +uv sync + +# Run unit tests (no Ollama needed) +uv run python -m pytest test/models/test_ollama.py -v + +# Run live eval (Ollama must be running) +uv run python -m lmms_eval \ + --model ollama \ + --model_args model_version=llava \ + --tasks mme --limit 8 +``` diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py index a7c1f92e5..25d71a3b4 100644 --- a/lmms_eval/models/__init__.py +++ b/lmms_eval/models/__init__.py @@ -114,6 +114,7 @@ AVAILABLE_CHAT_TEMPLATE_MODELS = { "bagel_lmms_engine": "BagelLmmsEngine", + "ollama": "Ollama", "fastvideo": "FastVideo", "internvl_hf": "InternVLHf", "llava_hf": "LlavaHf", diff --git a/lmms_eval/models/chat/ollama.py b/lmms_eval/models/chat/ollama.py new file mode 100644 index 000000000..f2f09f24b --- /dev/null +++ b/lmms_eval/models/chat/ollama.py @@ -0,0 +1,107 @@ +"""Ollama chat backend. + +Ollama exposes an OpenAI-compatible API at http://localhost:11434/v1, so this +backend inherits the full OpenAI chat implementation and overrides only +loglikelihood, which Ollama supports via the /api/generate endpoint with +``logprobs=True``. + +Example usage:: + + python -m lmms_eval \\ + --model ollama \\ + --model_args model_version=llava \\ + --tasks mme --limit 8 +""" + +from __future__ import annotations + +import math +import time +from typing import Any, List, Optional, Tuple + +import requests as http_requests +from loguru import logger as eval_logger +from tqdm import tqdm + +from lmms_eval.api.instance import Instance +from lmms_eval.api.registry import register_model +from lmms_eval.models.chat.openai import OpenAICompatible as OpenAICompatibleChatBase + +_OLLAMA_DEFAULT_BASE_URL = "http://localhost:11434/v1" +_OLLAMA_NO_KEY = "ollama" + + +@register_model("ollama") +class Ollama(OpenAICompatibleChatBase): + """Ollama local inference backend (OpenAI-compatible /v1 API).""" + + is_simple = False + + def __init__( + self, + model_version: str = "llava", + model: Optional[str] = None, + host: str = _OLLAMA_DEFAULT_BASE_URL, + base_url: Optional[str] = None, + api_key: str = _OLLAMA_NO_KEY, + num_concurrent: int = 4, + **kwargs: Any, + ) -> None: + resolved_base_url = base_url or host + # Derive the Ollama native API root (without /v1) for loglikelihood calls. + self._ollama_api_base = resolved_base_url.rstrip("/").removesuffix("/v1") + super().__init__( + model_version=model_version, + model=model, + base_url=resolved_base_url, + api_key=api_key, + num_concurrent=num_concurrent, + azure_openai=False, + **kwargs, + ) + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + """Compute log-likelihood of the continuation given the context. + + Uses Ollama's native ``POST /api/generate`` with ``logprobs=True`` so + we get per-token log-probs for the full prompt+continuation, then + subtract the prompt-only log-prob to isolate the continuation score. 
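
        Illustrative numbers (mirroring the unit tests): if the full prompt
        "The sky is blue" scores a summed log-prob of -2.0 and the context
        "The sky is" alone scores -1.0, the continuation " blue" is assigned
        -2.0 - (-1.0) = -1.0.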
+ """ + results: List[Tuple[float, bool]] = [] + url = f"{self._ollama_api_base}/api/generate" + + for instance in tqdm(requests, disable=(self.rank != 0), desc="Loglikelihood"): + context, continuation = instance.args[0], instance.args[1] + full_text = context + continuation + + def _get_logprob(prompt: str) -> float: + payload = { + "model": self.model_version, + "prompt": prompt, + "stream": False, + "logprobs": True, + "options": {"temperature": 0}, + } + for attempt in range(self.max_retries): + try: + resp = http_requests.post(url, json=payload, timeout=self.timeout * 6) + resp.raise_for_status() + data = resp.json() + token_lps = data.get("logprobs") or [] + return float(sum(token_lps)) + except Exception as exc: + eval_logger.info(f"loglikelihood attempt {attempt + 1}/{self.max_retries} failed: {exc}") + if attempt < self.max_retries - 1: + time.sleep(self.retry_backoff_s) + return -math.inf + + full_lp = _get_logprob(full_text) + if math.isinf(full_lp): + results.append((-math.inf, False)) + continue + ctx_lp = _get_logprob(context) if context else 0.0 + continuation_lp = full_lp - ctx_lp + is_greedy = continuation_lp >= 0.0 + results.append((continuation_lp, is_greedy)) + + return results diff --git a/test/models/test_ollama.py b/test/models/test_ollama.py new file mode 100644 index 000000000..3b4ceb8c2 --- /dev/null +++ b/test/models/test_ollama.py @@ -0,0 +1,192 @@ +"""Unit tests for the Ollama backend.""" + +from __future__ import annotations + +import math +import sys +import types +import unittest +from types import SimpleNamespace +from unittest import mock + + +def _ensure_decord_stub() -> None: + """Register a fake decord module so optional_import resolves without the package.""" + if "decord" not in sys.modules: + mod = types.ModuleType("decord") + mod.VideoReader = mock.MagicMock() + mod.cpu = mock.MagicMock() + sys.modules["decord"] = mod + + +def _fake_accelerator() -> mock.MagicMock: + acc = mock.MagicMock() + acc.num_processes = 1 + acc.local_process_index = 0 + acc.device = "cpu" + return acc + + +def _make_ollama(model_version: str = "llava", **kwargs): + _ensure_decord_stub() + from lmms_eval.models.chat.ollama import Ollama + + with mock.patch("lmms_eval.models.simple.openai.OpenAI"), mock.patch("lmms_eval.models.simple.openai.Accelerator", return_value=_fake_accelerator()): + return Ollama(model_version=model_version, **kwargs) + + +def _make_post_mock(responses: list[dict]) -> mock.MagicMock: + """MagicMock for http_requests.post returning each response dict in sequence.""" + side_effects = [] + for resp in responses: + r = mock.MagicMock() + r.raise_for_status = mock.MagicMock() + r.json.return_value = resp + side_effects.append(r) + return mock.MagicMock(side_effect=side_effects) + + +def _logprob_response(logprobs: list[float]) -> dict: + return {"logprobs": logprobs, "response": ""} + + +def _make_instance(context: str, continuation: str) -> SimpleNamespace: + return SimpleNamespace(args=(context, continuation), rank=0) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestOllamaRegistration(unittest.TestCase): + def test_registered_as_chat_model(self) -> None: + from lmms_eval.models import MODEL_REGISTRY_V2 + + manifest = MODEL_REGISTRY_V2.get_manifest("ollama") + self.assertEqual(manifest.model_id, "ollama") + self.assertEqual(manifest.chat_class_path, "lmms_eval.models.chat.ollama.Ollama") + 
self.assertIsNone(manifest.simple_class_path) + + def test_is_simple_false(self) -> None: + _ensure_decord_stub() + from lmms_eval.models.chat.ollama import Ollama + + self.assertFalse(Ollama.is_simple) + + +class TestOllamaInit(unittest.TestCase): + def test_default_base_url(self) -> None: + m = _make_ollama() + self.assertEqual(m._ollama_api_base, "http://localhost:11434") + + def test_custom_host_strips_v1(self) -> None: + m = _make_ollama(host="http://myserver:11434/v1") + self.assertEqual(m._ollama_api_base, "http://myserver:11434") + + def test_model_version_stored(self) -> None: + m = _make_ollama(model_version="mistral") + self.assertEqual(m.model_version, "mistral") + + def test_num_concurrent_default(self) -> None: + m = _make_ollama() + self.assertEqual(m.num_concurrent, 4) + + +class TestOllamaLoglikelihood(unittest.TestCase): + def test_loglikelihood_sums_continuation_tokens(self) -> None: + model = _make_ollama() + # full (ctx+cont) = -2.0, context-only = -1.0 → continuation = -1.0 + post_mock = _make_post_mock( + [ + _logprob_response([-1.0, -0.5, -0.5]), + _logprob_response([-0.8, -0.2]), + ] + ) + instance = _make_instance("The sky is", " blue") + with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): + results = model.loglikelihood([instance]) + + lp, is_greedy = results[0] + self.assertAlmostEqual(lp, -1.0, places=5) + self.assertFalse(is_greedy) + + def test_loglikelihood_payload_fields(self) -> None: + """Verify the POST payload contains the required Ollama API fields.""" + model = _make_ollama(model_version="llava") + post_mock = _make_post_mock( + [ + _logprob_response([-1.0]), + _logprob_response([-0.5]), + ] + ) + instance = _make_instance("ctx", " cont") + with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): + model.loglikelihood([instance]) + + first_call_kwargs = post_mock.call_args_list[0].kwargs + payload = first_call_kwargs["json"] + self.assertEqual(payload["model"], "llava") + self.assertTrue(payload["logprobs"]) + self.assertFalse(payload["stream"]) + self.assertEqual(payload["options"]["temperature"], 0) + self.assertIn("/api/generate", post_mock.call_args_list[0].args[0]) + + def test_loglikelihood_correct_prompts_sent(self) -> None: + """First call gets full text, second gets context only.""" + model = _make_ollama() + post_mock = _make_post_mock( + [ + _logprob_response([-1.0]), + _logprob_response([-0.5]), + ] + ) + instance = _make_instance("The sky is", " blue") + with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): + model.loglikelihood([instance]) + + self.assertEqual(post_mock.call_count, 2) + self.assertEqual(post_mock.call_args_list[0].kwargs["json"]["prompt"], "The sky is blue") + self.assertEqual(post_mock.call_args_list[1].kwargs["json"]["prompt"], "The sky is") + + def test_loglikelihood_empty_context_skips_second_call(self) -> None: + model = _make_ollama() + post_mock = _make_post_mock([_logprob_response([-0.5, -0.5])]) + instance = _make_instance("", "hello") + with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): + results = model.loglikelihood([instance]) + + self.assertEqual(post_mock.call_count, 1) + lp, _ = results[0] + self.assertAlmostEqual(lp, -1.0, places=5) + + def test_loglikelihood_returns_neg_inf_on_failure(self) -> None: + model = _make_ollama() + model.max_retries = 2 + model.retry_backoff_s = 0.0 + + instance = _make_instance("ctx", " cont") + with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", 
side_effect=Exception("conn refused")): + results = model.loglikelihood([instance]) + + lp, _ = results[0] + self.assertEqual(lp, -math.inf) + + def test_loglikelihood_positive_score_is_greedy(self) -> None: + model = _make_ollama() + post_mock = _make_post_mock( + [ + _logprob_response([0.1, 0.2]), + _logprob_response([-0.5]), + ] + ) + instance = _make_instance("ctx", " cont") + with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): + results = model.loglikelihood([instance]) + + _, is_greedy = results[0] + self.assertTrue(is_greedy) + + +if __name__ == "__main__": + unittest.main() From 5d3cbfe4ccdbab05c930bc48d83f64a91fbf6ea7 Mon Sep 17 00:00:00 2001 From: elias Date: Tue, 5 May 2026 20:39:18 +0200 Subject: [PATCH 2/3] PR ready --- lmms_eval/models/chat/ollama.py | 59 +++--------------- test/models/test_ollama.py | 106 +------------------------------- 2 files changed, 10 insertions(+), 155 deletions(-) diff --git a/lmms_eval/models/chat/ollama.py b/lmms_eval/models/chat/ollama.py index f2f09f24b..ae8ef86d3 100644 --- a/lmms_eval/models/chat/ollama.py +++ b/lmms_eval/models/chat/ollama.py @@ -1,9 +1,7 @@ """Ollama chat backend. Ollama exposes an OpenAI-compatible API at http://localhost:11434/v1, so this -backend inherits the full OpenAI chat implementation and overrides only -loglikelihood, which Ollama supports via the /api/generate endpoint with -``logprobs=True``. +backend inherits the OpenAI chat implementation for generation. Example usage:: @@ -15,14 +13,8 @@ from __future__ import annotations -import math -import time from typing import Any, List, Optional, Tuple -import requests as http_requests -from loguru import logger as eval_logger -from tqdm import tqdm - from lmms_eval.api.instance import Instance from lmms_eval.api.registry import register_model from lmms_eval.models.chat.openai import OpenAICompatible as OpenAICompatibleChatBase @@ -61,47 +53,12 @@ def __init__( ) def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: - """Compute log-likelihood of the continuation given the context. + """Ollama does not expose prompt-token log-likelihoods. - Uses Ollama's native ``POST /api/generate`` with ``logprobs=True`` so - we get per-token log-probs for the full prompt+continuation, then - subtract the prompt-only log-prob to isolate the continuation score. + Ollama's native ``POST /api/generate`` can return ``logprobs`` for + generated tokens, but lmms-eval's loglikelihood API needs the + likelihood of a provided continuation under a fixed context. Returning + a fabricated score would make multiple-choice likelihood tasks look + valid while producing misleading metrics. 
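+
+        Concretely: for the context "The sky is" and the candidate
+        continuation " blue", the harness needs the log-probability of
+        " blue" given "The sky is" for those exact tokens, which
+        /api/generate does not report for caller-supplied text.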
""" - results: List[Tuple[float, bool]] = [] - url = f"{self._ollama_api_base}/api/generate" - - for instance in tqdm(requests, disable=(self.rank != 0), desc="Loglikelihood"): - context, continuation = instance.args[0], instance.args[1] - full_text = context + continuation - - def _get_logprob(prompt: str) -> float: - payload = { - "model": self.model_version, - "prompt": prompt, - "stream": False, - "logprobs": True, - "options": {"temperature": 0}, - } - for attempt in range(self.max_retries): - try: - resp = http_requests.post(url, json=payload, timeout=self.timeout * 6) - resp.raise_for_status() - data = resp.json() - token_lps = data.get("logprobs") or [] - return float(sum(token_lps)) - except Exception as exc: - eval_logger.info(f"loglikelihood attempt {attempt + 1}/{self.max_retries} failed: {exc}") - if attempt < self.max_retries - 1: - time.sleep(self.retry_backoff_s) - return -math.inf - - full_lp = _get_logprob(full_text) - if math.isinf(full_lp): - results.append((-math.inf, False)) - continue - ctx_lp = _get_logprob(context) if context else 0.0 - continuation_lp = full_lp - ctx_lp - is_greedy = continuation_lp >= 0.0 - results.append((continuation_lp, is_greedy)) - - return results + raise NotImplementedError("Ollama loglikelihood is not supported; use generate_until tasks instead.") diff --git a/test/models/test_ollama.py b/test/models/test_ollama.py index 3b4ceb8c2..c20d39ea1 100644 --- a/test/models/test_ollama.py +++ b/test/models/test_ollama.py @@ -2,7 +2,6 @@ from __future__ import annotations -import math import sys import types import unittest @@ -35,21 +34,6 @@ def _make_ollama(model_version: str = "llava", **kwargs): return Ollama(model_version=model_version, **kwargs) -def _make_post_mock(responses: list[dict]) -> mock.MagicMock: - """MagicMock for http_requests.post returning each response dict in sequence.""" - side_effects = [] - for resp in responses: - r = mock.MagicMock() - r.raise_for_status = mock.MagicMock() - r.json.return_value = resp - side_effects.append(r) - return mock.MagicMock(side_effect=side_effects) - - -def _logprob_response(logprobs: list[float]) -> dict: - return {"logprobs": logprobs, "response": ""} - - def _make_instance(context: str, continuation: str) -> SimpleNamespace: return SimpleNamespace(args=(context, continuation), rank=0) @@ -94,99 +78,13 @@ def test_num_concurrent_default(self) -> None: class TestOllamaLoglikelihood(unittest.TestCase): - def test_loglikelihood_sums_continuation_tokens(self) -> None: + def test_loglikelihood_is_explicitly_unsupported(self) -> None: model = _make_ollama() - # full (ctx+cont) = -2.0, context-only = -1.0 → continuation = -1.0 - post_mock = _make_post_mock( - [ - _logprob_response([-1.0, -0.5, -0.5]), - _logprob_response([-0.8, -0.2]), - ] - ) instance = _make_instance("The sky is", " blue") - with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): - results = model.loglikelihood([instance]) - - lp, is_greedy = results[0] - self.assertAlmostEqual(lp, -1.0, places=5) - self.assertFalse(is_greedy) - - def test_loglikelihood_payload_fields(self) -> None: - """Verify the POST payload contains the required Ollama API fields.""" - model = _make_ollama(model_version="llava") - post_mock = _make_post_mock( - [ - _logprob_response([-1.0]), - _logprob_response([-0.5]), - ] - ) - instance = _make_instance("ctx", " cont") - with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): - model.loglikelihood([instance]) - first_call_kwargs = 
post_mock.call_args_list[0].kwargs - payload = first_call_kwargs["json"] - self.assertEqual(payload["model"], "llava") - self.assertTrue(payload["logprobs"]) - self.assertFalse(payload["stream"]) - self.assertEqual(payload["options"]["temperature"], 0) - self.assertIn("/api/generate", post_mock.call_args_list[0].args[0]) - - def test_loglikelihood_correct_prompts_sent(self) -> None: - """First call gets full text, second gets context only.""" - model = _make_ollama() - post_mock = _make_post_mock( - [ - _logprob_response([-1.0]), - _logprob_response([-0.5]), - ] - ) - instance = _make_instance("The sky is", " blue") - with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): + with self.assertRaisesRegex(NotImplementedError, "generate_until"): model.loglikelihood([instance]) - self.assertEqual(post_mock.call_count, 2) - self.assertEqual(post_mock.call_args_list[0].kwargs["json"]["prompt"], "The sky is blue") - self.assertEqual(post_mock.call_args_list[1].kwargs["json"]["prompt"], "The sky is") - - def test_loglikelihood_empty_context_skips_second_call(self) -> None: - model = _make_ollama() - post_mock = _make_post_mock([_logprob_response([-0.5, -0.5])]) - instance = _make_instance("", "hello") - with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): - results = model.loglikelihood([instance]) - - self.assertEqual(post_mock.call_count, 1) - lp, _ = results[0] - self.assertAlmostEqual(lp, -1.0, places=5) - - def test_loglikelihood_returns_neg_inf_on_failure(self) -> None: - model = _make_ollama() - model.max_retries = 2 - model.retry_backoff_s = 0.0 - - instance = _make_instance("ctx", " cont") - with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", side_effect=Exception("conn refused")): - results = model.loglikelihood([instance]) - - lp, _ = results[0] - self.assertEqual(lp, -math.inf) - - def test_loglikelihood_positive_score_is_greedy(self) -> None: - model = _make_ollama() - post_mock = _make_post_mock( - [ - _logprob_response([0.1, 0.2]), - _logprob_response([-0.5]), - ] - ) - instance = _make_instance("ctx", " cont") - with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): - results = model.loglikelihood([instance]) - - _, is_greedy = results[0] - self.assertTrue(is_greedy) - if __name__ == "__main__": unittest.main() From 6e17eaba28c543d932404a2b79493989e5c83aa1 Mon Sep 17 00:00:00 2001 From: elias Date: Tue, 5 May 2026 20:42:42 +0200 Subject: [PATCH 3/3] deleting notes --- OLLAMA_NEXT_STEPS.md | 128 ------------------------------------------- 1 file changed, 128 deletions(-) delete mode 100644 OLLAMA_NEXT_STEPS.md diff --git a/OLLAMA_NEXT_STEPS.md b/OLLAMA_NEXT_STEPS.md deleted file mode 100644 index 7768bcc61..000000000 --- a/OLLAMA_NEXT_STEPS.md +++ /dev/null @@ -1,128 +0,0 @@ -# Ollama Backend — Next Steps - -## Where we left off - -Branch: `feat/ollama-model` - -Two files were added: - -- `lmms_eval/models/chat/ollama.py` — the backend -- `test/models/test_ollama.py` — unit tests (10/10 passing, no Ollama required) - -One line was added to `lmms_eval/models/__init__.py`: -```python -AVAILABLE_CHAT_TEMPLATE_MODELS = { - "ollama": "Ollama", # ← added - ... -} -``` - ---- - -## What still needs to be done - -### 1. Install Ollama and pull a vision model - -```bash -# Install from https://ollama.com -ollama serve # start the server (runs on localhost:11434) -ollama pull llava # smallest vision model, good for smoke testing -ollama pull llava-llama3 # better quality if you have the VRAM -``` - -### 2. 
Run a live smoke test - -```bash -uv run python -m lmms_eval \ - --model ollama \ - --model_args model_version=llava \ - --tasks mme \ - --limit 8 -``` - -Expected: 8 samples evaluated, scores printed. If it errors, check: -- Is `ollama serve` running? -- Does `ollama list` show `llava`? -- Is the `logprobs` field actually present in `/api/generate` responses for your Ollama version? - -### 3. Verify loglikelihood against the real API - -The `loglikelihood` implementation uses `POST /api/generate` with `logprobs=True`. -This field was added in Ollama v0.1.38. Confirm it works: - -```bash -curl http://localhost:11434/api/generate -d '{ - "model": "llava", - "prompt": "The sky is blue", - "stream": false, - "logprobs": true -}' -``` - -The response should contain a `"logprobs"` array of floats. If it's missing or null, -the implementation will silently return `-inf` for all loglikelihood requests — tasks -that depend on it (e.g. multiple-choice scoring) will produce wrong results. - -### 4. Run pre-commit linting (required before PR) - -```bash -uv run pip install pre-commit -uv run pre-commit install -uv run pre-commit run --all-files -``` - -This runs Black (line length 240) + isort. Fix any formatting issues it flags. - -### 5. Commit - -```bash -git add lmms_eval/models/chat/ollama.py \ - lmms_eval/models/__init__.py \ - test/models/test_ollama.py -git commit -m "feat: add Ollama local inference backend" -``` - -### 6. Open a pull request - -The upstream repo is `EvolvingLMMs-Lab/lmms-eval`. You'll need to fork it if you -haven't already, push the branch, and open a PR against `main`. - -PR description should include: -- What model this adds and why (local inference, no API key, multimodal) -- Supported Ollama models (llava, llava-llama3, moondream, minicpm-v, ...) -- Known limitations (no video/audio, logprobs requires Ollama ≥ v0.1.38) -- Example usage command - ---- - -## Known limitations to mention in the PR - -| Feature | Status | -|---|---| -| Image input | Works (base64 via OpenAI-compat /v1) | -| Text-only models | Works | -| Video input | Not supported by Ollama | -| Audio input | Not supported by Ollama | -| `loglikelihood` | Works if Ollama ≥ v0.1.38 and model supports it | -| `generate_until_multi_round` | Inherited from OpenAI parent (raises NotImplementedError) | - ---- - -## Quick reference - -```bash -# Set up on new machine -git clone -cd lmms-eval -git checkout feat/ollama-model -uv sync - -# Run unit tests (no Ollama needed) -uv run python -m pytest test/models/test_ollama.py -v - -# Run live eval (Ollama must be running) -uv run python -m lmms_eval \ - --model ollama \ - --model_args model_version=llava \ - --tasks mme --limit 8 -```