From 261e1f8f08661eae70dc3737c6d49cd4e3c17e84 Mon Sep 17 00:00:00 2001 From: Elias Date: Tue, 5 May 2026 18:39:47 +0200 Subject: [PATCH 1/3] preparing for PR --- OLLAMA_NEXT_STEPS.md | 128 +++++++++++++++++++++ lmms_eval/models/__init__.py | 1 + lmms_eval/models/chat/ollama.py | 107 ++++++++++++++++++ test/models/test_ollama.py | 192 ++++++++++++++++++++++++++++++++ 4 files changed, 428 insertions(+) create mode 100644 OLLAMA_NEXT_STEPS.md create mode 100644 lmms_eval/models/chat/ollama.py create mode 100644 test/models/test_ollama.py diff --git a/OLLAMA_NEXT_STEPS.md b/OLLAMA_NEXT_STEPS.md new file mode 100644 index 000000000..7768bcc61 --- /dev/null +++ b/OLLAMA_NEXT_STEPS.md @@ -0,0 +1,128 @@ +# Ollama Backend — Next Steps + +## Where we left off + +Branch: `feat/ollama-model` + +Two files were added: + +- `lmms_eval/models/chat/ollama.py` — the backend +- `test/models/test_ollama.py` — unit tests (10/10 passing, no Ollama required) + +One line was added to `lmms_eval/models/__init__.py`: +```python +AVAILABLE_CHAT_TEMPLATE_MODELS = { + "ollama": "Ollama", # ← added + ... +} +``` + +--- + +## What still needs to be done + +### 1. Install Ollama and pull a vision model + +```bash +# Install from https://ollama.com +ollama serve # start the server (runs on localhost:11434) +ollama pull llava # smallest vision model, good for smoke testing +ollama pull llava-llama3 # better quality if you have the VRAM +``` + +### 2. Run a live smoke test + +```bash +uv run python -m lmms_eval \ + --model ollama \ + --model_args model_version=llava \ + --tasks mme \ + --limit 8 +``` + +Expected: 8 samples evaluated, scores printed. If it errors, check: +- Is `ollama serve` running? +- Does `ollama list` show `llava`? +- Is the `logprobs` field actually present in `/api/generate` responses for your Ollama version? + +### 3. Verify loglikelihood against the real API + +The `loglikelihood` implementation uses `POST /api/generate` with `logprobs=True`. +This field was added in Ollama v0.1.38. Confirm it works: + +```bash +curl http://localhost:11434/api/generate -d '{ + "model": "llava", + "prompt": "The sky is blue", + "stream": false, + "logprobs": true +}' +``` + +The response should contain a `"logprobs"` array of floats. If it's missing or null, +the implementation will silently return `-inf` for all loglikelihood requests — tasks +that depend on it (e.g. multiple-choice scoring) will produce wrong results. + +### 4. Run pre-commit linting (required before PR) + +```bash +uv run pip install pre-commit +uv run pre-commit install +uv run pre-commit run --all-files +``` + +This runs Black (line length 240) + isort. Fix any formatting issues it flags. + +### 5. Commit + +```bash +git add lmms_eval/models/chat/ollama.py \ + lmms_eval/models/__init__.py \ + test/models/test_ollama.py +git commit -m "feat: add Ollama local inference backend" +``` + +### 6. Open a pull request + +The upstream repo is `EvolvingLMMs-Lab/lmms-eval`. You'll need to fork it if you +haven't already, push the branch, and open a PR against `main`. + +PR description should include: +- What model this adds and why (local inference, no API key, multimodal) +- Supported Ollama models (llava, llava-llama3, moondream, minicpm-v, ...) 
+- Known limitations (no video/audio, logprobs requires Ollama ≥ v0.1.38) +- Example usage command + +--- + +## Known limitations to mention in the PR + +| Feature | Status | +|---|---| +| Image input | Works (base64 via OpenAI-compat /v1) | +| Text-only models | Works | +| Video input | Not supported by Ollama | +| Audio input | Not supported by Ollama | +| `loglikelihood` | Works if Ollama ≥ v0.1.38 and model supports it | +| `generate_until_multi_round` | Inherited from OpenAI parent (raises NotImplementedError) | + +--- + +## Quick reference + +```bash +# Set up on new machine +git clone +cd lmms-eval +git checkout feat/ollama-model +uv sync + +# Run unit tests (no Ollama needed) +uv run python -m pytest test/models/test_ollama.py -v + +# Run live eval (Ollama must be running) +uv run python -m lmms_eval \ + --model ollama \ + --model_args model_version=llava \ + --tasks mme --limit 8 +``` diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py index a7c1f92e5..25d71a3b4 100644 --- a/lmms_eval/models/__init__.py +++ b/lmms_eval/models/__init__.py @@ -114,6 +114,7 @@ AVAILABLE_CHAT_TEMPLATE_MODELS = { "bagel_lmms_engine": "BagelLmmsEngine", + "ollama": "Ollama", "fastvideo": "FastVideo", "internvl_hf": "InternVLHf", "llava_hf": "LlavaHf", diff --git a/lmms_eval/models/chat/ollama.py b/lmms_eval/models/chat/ollama.py new file mode 100644 index 000000000..f2f09f24b --- /dev/null +++ b/lmms_eval/models/chat/ollama.py @@ -0,0 +1,107 @@ +"""Ollama chat backend. + +Ollama exposes an OpenAI-compatible API at http://localhost:11434/v1, so this +backend inherits the full OpenAI chat implementation and overrides only +loglikelihood, which Ollama supports via the /api/generate endpoint with +``logprobs=True``. + +Example usage:: + + python -m lmms_eval \\ + --model ollama \\ + --model_args model_version=llava \\ + --tasks mme --limit 8 +""" + +from __future__ import annotations + +import math +import time +from typing import Any, List, Optional, Tuple + +import requests as http_requests +from loguru import logger as eval_logger +from tqdm import tqdm + +from lmms_eval.api.instance import Instance +from lmms_eval.api.registry import register_model +from lmms_eval.models.chat.openai import OpenAICompatible as OpenAICompatibleChatBase + +_OLLAMA_DEFAULT_BASE_URL = "http://localhost:11434/v1" +_OLLAMA_NO_KEY = "ollama" + + +@register_model("ollama") +class Ollama(OpenAICompatibleChatBase): + """Ollama local inference backend (OpenAI-compatible /v1 API).""" + + is_simple = False + + def __init__( + self, + model_version: str = "llava", + model: Optional[str] = None, + host: str = _OLLAMA_DEFAULT_BASE_URL, + base_url: Optional[str] = None, + api_key: str = _OLLAMA_NO_KEY, + num_concurrent: int = 4, + **kwargs: Any, + ) -> None: + resolved_base_url = base_url or host + # Derive the Ollama native API root (without /v1) for loglikelihood calls. + self._ollama_api_base = resolved_base_url.rstrip("/").removesuffix("/v1") + super().__init__( + model_version=model_version, + model=model, + base_url=resolved_base_url, + api_key=api_key, + num_concurrent=num_concurrent, + azure_openai=False, + **kwargs, + ) + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + """Compute log-likelihood of the continuation given the context. + + Uses Ollama's native ``POST /api/generate`` with ``logprobs=True`` so + we get per-token log-probs for the full prompt+continuation, then + subtract the prompt-only log-prob to isolate the continuation score. 
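
        Illustrative numbers (mirroring the unit tests): if the full prompt
        "The sky is blue" scores a summed log-prob of -2.0 and the context
        "The sky is" alone scores -1.0, the continuation " blue" is assigned
        -2.0 - (-1.0) = -1.0.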
+ """ + results: List[Tuple[float, bool]] = [] + url = f"{self._ollama_api_base}/api/generate" + + for instance in tqdm(requests, disable=(self.rank != 0), desc="Loglikelihood"): + context, continuation = instance.args[0], instance.args[1] + full_text = context + continuation + + def _get_logprob(prompt: str) -> float: + payload = { + "model": self.model_version, + "prompt": prompt, + "stream": False, + "logprobs": True, + "options": {"temperature": 0}, + } + for attempt in range(self.max_retries): + try: + resp = http_requests.post(url, json=payload, timeout=self.timeout * 6) + resp.raise_for_status() + data = resp.json() + token_lps = data.get("logprobs") or [] + return float(sum(token_lps)) + except Exception as exc: + eval_logger.info(f"loglikelihood attempt {attempt + 1}/{self.max_retries} failed: {exc}") + if attempt < self.max_retries - 1: + time.sleep(self.retry_backoff_s) + return -math.inf + + full_lp = _get_logprob(full_text) + if math.isinf(full_lp): + results.append((-math.inf, False)) + continue + ctx_lp = _get_logprob(context) if context else 0.0 + continuation_lp = full_lp - ctx_lp + is_greedy = continuation_lp >= 0.0 + results.append((continuation_lp, is_greedy)) + + return results diff --git a/test/models/test_ollama.py b/test/models/test_ollama.py new file mode 100644 index 000000000..3b4ceb8c2 --- /dev/null +++ b/test/models/test_ollama.py @@ -0,0 +1,192 @@ +"""Unit tests for the Ollama backend.""" + +from __future__ import annotations + +import math +import sys +import types +import unittest +from types import SimpleNamespace +from unittest import mock + + +def _ensure_decord_stub() -> None: + """Register a fake decord module so optional_import resolves without the package.""" + if "decord" not in sys.modules: + mod = types.ModuleType("decord") + mod.VideoReader = mock.MagicMock() + mod.cpu = mock.MagicMock() + sys.modules["decord"] = mod + + +def _fake_accelerator() -> mock.MagicMock: + acc = mock.MagicMock() + acc.num_processes = 1 + acc.local_process_index = 0 + acc.device = "cpu" + return acc + + +def _make_ollama(model_version: str = "llava", **kwargs): + _ensure_decord_stub() + from lmms_eval.models.chat.ollama import Ollama + + with mock.patch("lmms_eval.models.simple.openai.OpenAI"), mock.patch("lmms_eval.models.simple.openai.Accelerator", return_value=_fake_accelerator()): + return Ollama(model_version=model_version, **kwargs) + + +def _make_post_mock(responses: list[dict]) -> mock.MagicMock: + """MagicMock for http_requests.post returning each response dict in sequence.""" + side_effects = [] + for resp in responses: + r = mock.MagicMock() + r.raise_for_status = mock.MagicMock() + r.json.return_value = resp + side_effects.append(r) + return mock.MagicMock(side_effect=side_effects) + + +def _logprob_response(logprobs: list[float]) -> dict: + return {"logprobs": logprobs, "response": ""} + + +def _make_instance(context: str, continuation: str) -> SimpleNamespace: + return SimpleNamespace(args=(context, continuation), rank=0) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestOllamaRegistration(unittest.TestCase): + def test_registered_as_chat_model(self) -> None: + from lmms_eval.models import MODEL_REGISTRY_V2 + + manifest = MODEL_REGISTRY_V2.get_manifest("ollama") + self.assertEqual(manifest.model_id, "ollama") + self.assertEqual(manifest.chat_class_path, "lmms_eval.models.chat.ollama.Ollama") + 
self.assertIsNone(manifest.simple_class_path) + + def test_is_simple_false(self) -> None: + _ensure_decord_stub() + from lmms_eval.models.chat.ollama import Ollama + + self.assertFalse(Ollama.is_simple) + + +class TestOllamaInit(unittest.TestCase): + def test_default_base_url(self) -> None: + m = _make_ollama() + self.assertEqual(m._ollama_api_base, "http://localhost:11434") + + def test_custom_host_strips_v1(self) -> None: + m = _make_ollama(host="http://myserver:11434/v1") + self.assertEqual(m._ollama_api_base, "http://myserver:11434") + + def test_model_version_stored(self) -> None: + m = _make_ollama(model_version="mistral") + self.assertEqual(m.model_version, "mistral") + + def test_num_concurrent_default(self) -> None: + m = _make_ollama() + self.assertEqual(m.num_concurrent, 4) + + +class TestOllamaLoglikelihood(unittest.TestCase): + def test_loglikelihood_sums_continuation_tokens(self) -> None: + model = _make_ollama() + # full (ctx+cont) = -2.0, context-only = -1.0 → continuation = -1.0 + post_mock = _make_post_mock( + [ + _logprob_response([-1.0, -0.5, -0.5]), + _logprob_response([-0.8, -0.2]), + ] + ) + instance = _make_instance("The sky is", " blue") + with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): + results = model.loglikelihood([instance]) + + lp, is_greedy = results[0] + self.assertAlmostEqual(lp, -1.0, places=5) + self.assertFalse(is_greedy) + + def test_loglikelihood_payload_fields(self) -> None: + """Verify the POST payload contains the required Ollama API fields.""" + model = _make_ollama(model_version="llava") + post_mock = _make_post_mock( + [ + _logprob_response([-1.0]), + _logprob_response([-0.5]), + ] + ) + instance = _make_instance("ctx", " cont") + with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): + model.loglikelihood([instance]) + + first_call_kwargs = post_mock.call_args_list[0].kwargs + payload = first_call_kwargs["json"] + self.assertEqual(payload["model"], "llava") + self.assertTrue(payload["logprobs"]) + self.assertFalse(payload["stream"]) + self.assertEqual(payload["options"]["temperature"], 0) + self.assertIn("/api/generate", post_mock.call_args_list[0].args[0]) + + def test_loglikelihood_correct_prompts_sent(self) -> None: + """First call gets full text, second gets context only.""" + model = _make_ollama() + post_mock = _make_post_mock( + [ + _logprob_response([-1.0]), + _logprob_response([-0.5]), + ] + ) + instance = _make_instance("The sky is", " blue") + with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): + model.loglikelihood([instance]) + + self.assertEqual(post_mock.call_count, 2) + self.assertEqual(post_mock.call_args_list[0].kwargs["json"]["prompt"], "The sky is blue") + self.assertEqual(post_mock.call_args_list[1].kwargs["json"]["prompt"], "The sky is") + + def test_loglikelihood_empty_context_skips_second_call(self) -> None: + model = _make_ollama() + post_mock = _make_post_mock([_logprob_response([-0.5, -0.5])]) + instance = _make_instance("", "hello") + with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): + results = model.loglikelihood([instance]) + + self.assertEqual(post_mock.call_count, 1) + lp, _ = results[0] + self.assertAlmostEqual(lp, -1.0, places=5) + + def test_loglikelihood_returns_neg_inf_on_failure(self) -> None: + model = _make_ollama() + model.max_retries = 2 + model.retry_backoff_s = 0.0 + + instance = _make_instance("ctx", " cont") + with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", 
side_effect=Exception("conn refused")): + results = model.loglikelihood([instance]) + + lp, _ = results[0] + self.assertEqual(lp, -math.inf) + + def test_loglikelihood_positive_score_is_greedy(self) -> None: + model = _make_ollama() + post_mock = _make_post_mock( + [ + _logprob_response([0.1, 0.2]), + _logprob_response([-0.5]), + ] + ) + instance = _make_instance("ctx", " cont") + with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): + results = model.loglikelihood([instance]) + + _, is_greedy = results[0] + self.assertTrue(is_greedy) + + +if __name__ == "__main__": + unittest.main() From 5d3cbfe4ccdbab05c930bc48d83f64a91fbf6ea7 Mon Sep 17 00:00:00 2001 From: elias Date: Tue, 5 May 2026 20:39:18 +0200 Subject: [PATCH 2/3] PR ready --- lmms_eval/models/chat/ollama.py | 59 +++--------------- test/models/test_ollama.py | 106 +------------------------------- 2 files changed, 10 insertions(+), 155 deletions(-) diff --git a/lmms_eval/models/chat/ollama.py b/lmms_eval/models/chat/ollama.py index f2f09f24b..ae8ef86d3 100644 --- a/lmms_eval/models/chat/ollama.py +++ b/lmms_eval/models/chat/ollama.py @@ -1,9 +1,7 @@ """Ollama chat backend. Ollama exposes an OpenAI-compatible API at http://localhost:11434/v1, so this -backend inherits the full OpenAI chat implementation and overrides only -loglikelihood, which Ollama supports via the /api/generate endpoint with -``logprobs=True``. +backend inherits the OpenAI chat implementation for generation. Example usage:: @@ -15,14 +13,8 @@ from __future__ import annotations -import math -import time from typing import Any, List, Optional, Tuple -import requests as http_requests -from loguru import logger as eval_logger -from tqdm import tqdm - from lmms_eval.api.instance import Instance from lmms_eval.api.registry import register_model from lmms_eval.models.chat.openai import OpenAICompatible as OpenAICompatibleChatBase @@ -61,47 +53,12 @@ def __init__( ) def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: - """Compute log-likelihood of the continuation given the context. + """Ollama does not expose prompt-token log-likelihoods. - Uses Ollama's native ``POST /api/generate`` with ``logprobs=True`` so - we get per-token log-probs for the full prompt+continuation, then - subtract the prompt-only log-prob to isolate the continuation score. + Ollama's native ``POST /api/generate`` can return ``logprobs`` for + generated tokens, but lmms-eval's loglikelihood API needs the + likelihood of a provided continuation under a fixed context. Returning + a fabricated score would make multiple-choice likelihood tasks look + valid while producing misleading metrics. 
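+
+        Concretely: for the context "The sky is" and the candidate
+        continuation " blue", the harness needs the log-probability of
+        " blue" given "The sky is" for those exact tokens, which
+        /api/generate does not report for caller-supplied text.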
""" - results: List[Tuple[float, bool]] = [] - url = f"{self._ollama_api_base}/api/generate" - - for instance in tqdm(requests, disable=(self.rank != 0), desc="Loglikelihood"): - context, continuation = instance.args[0], instance.args[1] - full_text = context + continuation - - def _get_logprob(prompt: str) -> float: - payload = { - "model": self.model_version, - "prompt": prompt, - "stream": False, - "logprobs": True, - "options": {"temperature": 0}, - } - for attempt in range(self.max_retries): - try: - resp = http_requests.post(url, json=payload, timeout=self.timeout * 6) - resp.raise_for_status() - data = resp.json() - token_lps = data.get("logprobs") or [] - return float(sum(token_lps)) - except Exception as exc: - eval_logger.info(f"loglikelihood attempt {attempt + 1}/{self.max_retries} failed: {exc}") - if attempt < self.max_retries - 1: - time.sleep(self.retry_backoff_s) - return -math.inf - - full_lp = _get_logprob(full_text) - if math.isinf(full_lp): - results.append((-math.inf, False)) - continue - ctx_lp = _get_logprob(context) if context else 0.0 - continuation_lp = full_lp - ctx_lp - is_greedy = continuation_lp >= 0.0 - results.append((continuation_lp, is_greedy)) - - return results + raise NotImplementedError("Ollama loglikelihood is not supported; use generate_until tasks instead.") diff --git a/test/models/test_ollama.py b/test/models/test_ollama.py index 3b4ceb8c2..c20d39ea1 100644 --- a/test/models/test_ollama.py +++ b/test/models/test_ollama.py @@ -2,7 +2,6 @@ from __future__ import annotations -import math import sys import types import unittest @@ -35,21 +34,6 @@ def _make_ollama(model_version: str = "llava", **kwargs): return Ollama(model_version=model_version, **kwargs) -def _make_post_mock(responses: list[dict]) -> mock.MagicMock: - """MagicMock for http_requests.post returning each response dict in sequence.""" - side_effects = [] - for resp in responses: - r = mock.MagicMock() - r.raise_for_status = mock.MagicMock() - r.json.return_value = resp - side_effects.append(r) - return mock.MagicMock(side_effect=side_effects) - - -def _logprob_response(logprobs: list[float]) -> dict: - return {"logprobs": logprobs, "response": ""} - - def _make_instance(context: str, continuation: str) -> SimpleNamespace: return SimpleNamespace(args=(context, continuation), rank=0) @@ -94,99 +78,13 @@ def test_num_concurrent_default(self) -> None: class TestOllamaLoglikelihood(unittest.TestCase): - def test_loglikelihood_sums_continuation_tokens(self) -> None: + def test_loglikelihood_is_explicitly_unsupported(self) -> None: model = _make_ollama() - # full (ctx+cont) = -2.0, context-only = -1.0 → continuation = -1.0 - post_mock = _make_post_mock( - [ - _logprob_response([-1.0, -0.5, -0.5]), - _logprob_response([-0.8, -0.2]), - ] - ) instance = _make_instance("The sky is", " blue") - with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): - results = model.loglikelihood([instance]) - - lp, is_greedy = results[0] - self.assertAlmostEqual(lp, -1.0, places=5) - self.assertFalse(is_greedy) - - def test_loglikelihood_payload_fields(self) -> None: - """Verify the POST payload contains the required Ollama API fields.""" - model = _make_ollama(model_version="llava") - post_mock = _make_post_mock( - [ - _logprob_response([-1.0]), - _logprob_response([-0.5]), - ] - ) - instance = _make_instance("ctx", " cont") - with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): - model.loglikelihood([instance]) - first_call_kwargs = 
post_mock.call_args_list[0].kwargs - payload = first_call_kwargs["json"] - self.assertEqual(payload["model"], "llava") - self.assertTrue(payload["logprobs"]) - self.assertFalse(payload["stream"]) - self.assertEqual(payload["options"]["temperature"], 0) - self.assertIn("/api/generate", post_mock.call_args_list[0].args[0]) - - def test_loglikelihood_correct_prompts_sent(self) -> None: - """First call gets full text, second gets context only.""" - model = _make_ollama() - post_mock = _make_post_mock( - [ - _logprob_response([-1.0]), - _logprob_response([-0.5]), - ] - ) - instance = _make_instance("The sky is", " blue") - with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): + with self.assertRaisesRegex(NotImplementedError, "generate_until"): model.loglikelihood([instance]) - self.assertEqual(post_mock.call_count, 2) - self.assertEqual(post_mock.call_args_list[0].kwargs["json"]["prompt"], "The sky is blue") - self.assertEqual(post_mock.call_args_list[1].kwargs["json"]["prompt"], "The sky is") - - def test_loglikelihood_empty_context_skips_second_call(self) -> None: - model = _make_ollama() - post_mock = _make_post_mock([_logprob_response([-0.5, -0.5])]) - instance = _make_instance("", "hello") - with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): - results = model.loglikelihood([instance]) - - self.assertEqual(post_mock.call_count, 1) - lp, _ = results[0] - self.assertAlmostEqual(lp, -1.0, places=5) - - def test_loglikelihood_returns_neg_inf_on_failure(self) -> None: - model = _make_ollama() - model.max_retries = 2 - model.retry_backoff_s = 0.0 - - instance = _make_instance("ctx", " cont") - with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", side_effect=Exception("conn refused")): - results = model.loglikelihood([instance]) - - lp, _ = results[0] - self.assertEqual(lp, -math.inf) - - def test_loglikelihood_positive_score_is_greedy(self) -> None: - model = _make_ollama() - post_mock = _make_post_mock( - [ - _logprob_response([0.1, 0.2]), - _logprob_response([-0.5]), - ] - ) - instance = _make_instance("ctx", " cont") - with mock.patch("lmms_eval.models.chat.ollama.http_requests.post", post_mock): - results = model.loglikelihood([instance]) - - _, is_greedy = results[0] - self.assertTrue(is_greedy) - if __name__ == "__main__": unittest.main() From 6e17eaba28c543d932404a2b79493989e5c83aa1 Mon Sep 17 00:00:00 2001 From: elias Date: Tue, 5 May 2026 20:42:42 +0200 Subject: [PATCH 3/3] deleting notes --- OLLAMA_NEXT_STEPS.md | 128 ------------------------------------------- 1 file changed, 128 deletions(-) delete mode 100644 OLLAMA_NEXT_STEPS.md diff --git a/OLLAMA_NEXT_STEPS.md b/OLLAMA_NEXT_STEPS.md deleted file mode 100644 index 7768bcc61..000000000 --- a/OLLAMA_NEXT_STEPS.md +++ /dev/null @@ -1,128 +0,0 @@ -# Ollama Backend — Next Steps - -## Where we left off - -Branch: `feat/ollama-model` - -Two files were added: - -- `lmms_eval/models/chat/ollama.py` — the backend -- `test/models/test_ollama.py` — unit tests (10/10 passing, no Ollama required) - -One line was added to `lmms_eval/models/__init__.py`: -```python -AVAILABLE_CHAT_TEMPLATE_MODELS = { - "ollama": "Ollama", # ← added - ... -} -``` - ---- - -## What still needs to be done - -### 1. Install Ollama and pull a vision model - -```bash -# Install from https://ollama.com -ollama serve # start the server (runs on localhost:11434) -ollama pull llava # smallest vision model, good for smoke testing -ollama pull llava-llama3 # better quality if you have the VRAM -``` - -### 2. 
Run a live smoke test - -```bash -uv run python -m lmms_eval \ - --model ollama \ - --model_args model_version=llava \ - --tasks mme \ - --limit 8 -``` - -Expected: 8 samples evaluated, scores printed. If it errors, check: -- Is `ollama serve` running? -- Does `ollama list` show `llava`? -- Is the `logprobs` field actually present in `/api/generate` responses for your Ollama version? - -### 3. Verify loglikelihood against the real API - -The `loglikelihood` implementation uses `POST /api/generate` with `logprobs=True`. -This field was added in Ollama v0.1.38. Confirm it works: - -```bash -curl http://localhost:11434/api/generate -d '{ - "model": "llava", - "prompt": "The sky is blue", - "stream": false, - "logprobs": true -}' -``` - -The response should contain a `"logprobs"` array of floats. If it's missing or null, -the implementation will silently return `-inf` for all loglikelihood requests — tasks -that depend on it (e.g. multiple-choice scoring) will produce wrong results. - -### 4. Run pre-commit linting (required before PR) - -```bash -uv run pip install pre-commit -uv run pre-commit install -uv run pre-commit run --all-files -``` - -This runs Black (line length 240) + isort. Fix any formatting issues it flags. - -### 5. Commit - -```bash -git add lmms_eval/models/chat/ollama.py \ - lmms_eval/models/__init__.py \ - test/models/test_ollama.py -git commit -m "feat: add Ollama local inference backend" -``` - -### 6. Open a pull request - -The upstream repo is `EvolvingLMMs-Lab/lmms-eval`. You'll need to fork it if you -haven't already, push the branch, and open a PR against `main`. - -PR description should include: -- What model this adds and why (local inference, no API key, multimodal) -- Supported Ollama models (llava, llava-llama3, moondream, minicpm-v, ...) -- Known limitations (no video/audio, logprobs requires Ollama ≥ v0.1.38) -- Example usage command - ---- - -## Known limitations to mention in the PR - -| Feature | Status | -|---|---| -| Image input | Works (base64 via OpenAI-compat /v1) | -| Text-only models | Works | -| Video input | Not supported by Ollama | -| Audio input | Not supported by Ollama | -| `loglikelihood` | Works if Ollama ≥ v0.1.38 and model supports it | -| `generate_until_multi_round` | Inherited from OpenAI parent (raises NotImplementedError) | - ---- - -## Quick reference - -```bash -# Set up on new machine -git clone -cd lmms-eval -git checkout feat/ollama-model -uv sync - -# Run unit tests (no Ollama needed) -uv run python -m pytest test/models/test_ollama.py -v - -# Run live eval (Ollama must be running) -uv run python -m lmms_eval \ - --model ollama \ - --model_args model_version=llava \ - --tasks mme --limit 8 -```