Skip to content

Commit 10b16b2

Browse files
feat: propagate hf_revision to diff command and clarify Ollama chat handling comments
1 parent 970a8d5 commit 10b16b2

2 files changed

Lines changed: 6 additions & 4 deletions

File tree

src/infer_check/backends/openai_compat.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -52,10 +52,10 @@ def __init__(
5252
self._chat = chat
5353
self._revision = revision
5454
self._disable_thinking = disable_thinking
55-
# Ollama listens on :11434 by default. When we're talking to Ollama and
56-
# thinking is disabled, we prepend "/no_think" to the user message — a
57-
# directive that Qwen3 and some Gemma/Ollama templates honour even when
58-
# the top-level `think` field is ignored.
55+
# Ollama listens on :11434 by default. Track it so later request
56+
# handling can apply Ollama-specific chat behavior when thinking is
57+
# disabled (for example, using request flags and stripping think
58+
# tokens from responses) rather than relying on prompt rewriting.
5959
self._is_ollama = ":11434" in self._base_url
6060

6161
headers: dict[str, str] = {}

src/infer_check/cli.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -399,13 +399,15 @@ def compare(
399399
backend_type=resolved_a.backend,
400400
model_id=resolved_a.model_id,
401401
quantization=resolved_a.label,
402+
hf_revision=resolved_a.revision,
402403
base_url=resolved_a.base_url,
403404
disable_thinking=disable_thinking,
404405
)
405406
config_b = BackendConfig(
406407
backend_type=resolved_b.backend,
407408
model_id=resolved_b.model_id,
408409
quantization=resolved_b.label,
410+
hf_revision=resolved_b.revision,
409411
base_url=resolved_b.base_url,
410412
disable_thinking=disable_thinking,
411413
)

0 commit comments

Comments (0)