Bug/max tokens (#10)

NullPointerDepressiveDisorder · Copilot · web-flow · commit 846331be926f · 2026-04-06T11:07:21.000-07:00
* feat: add global --max-tokens CLI flag and increase default max_tokens to 1024

* fix: handle missing content by falling back to reasoning_content in OpenAI compatibility

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix: ensure prompt-level max_tokens overrides the global CLI flag and add tests for precedence

- Track explicitly set fields in prompt metadata to distinguish prompt-level max_tokens
- Only apply global --max-tokens if max_tokens not set in prompt JSONL
- Add tests to verify prompt-level max_tokens takes precedence over the global flag
- Make model cleanup in mlx_lm async-safe

* fix: use model_fields_set for max_tokens override detection and update tests

- Replace usage of metadata["__fields_set__"] with model_fields_set to check if max_tokens is set in prompt
- Remove manual tracking of __fields_set__ in loader.py
- Update test_cli_max_tokens.py to use SweepResult instead of MagicMock for mock results
- Change ruff target-version to py311 in pyproject.toml

* fix: set correct default and type for --max-tokens CLI option, update python_version in mypy config

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+# 0.2.0 (2026-04-02)
+
+- Added global `--max-tokens` flag (defaults to 1024) to the main CLI.
+- Increased default `max_tokens` for all prompts from 256 to 1024.
+
 # 0.1.0 (2026-03-11)
 
 - Initial release
diff --git a/README.md b/README.md
@@ -68,7 +68,7 @@ pip install "infer-check[mlx]"
 
 ### Quantization sweep
 
-Compare pre-quantized models against a baseline. Each model is a separate HuggingFace repo.
+Compare pre-quantized models against a baseline. Each model is a separate HuggingFace repo. Use `--max-tokens` to control generation length (defaults to 1024).
 
 ```
 infer-check sweep \
@@ -77,6 +77,7 @@ infer-check sweep \
             4bit=mlx-community/Meta-Llama-3.1-8B-Instruct-4bit" \
   --backend mlx-lm \
   --prompts reasoning \
+  --max-tokens 512 \
   --output ./results/sweep/
 ```
 
@@ -161,7 +162,7 @@ Curated prompts targeting known quantization failure modes:
 | `quant-sensitive.jsonl` | 20 | Multi-digit arithmetic, long CoT, precise syntax |
 | `determinism.jsonl` | 50 | High-entropy continuations for determinism testing |
 
-All suites ship with the package — no need to clone the repo. Custom suites are JSONL files with one object per line:
+All suites ship with the package — no need to clone the repo. Custom suites are JSONL files with one object per line (default `max_tokens` is 1024):
 
 ```json
 {"id": "custom-001", "text": "Your prompt here", "category": "math", "max_tokens": 512}
diff --git a/pyproject.toml b/pyproject.toml
@@ -65,7 +65,7 @@ dev = [
 ]
 
 [tool.ruff]
-target-version = "py313"
+target-version = "py311"
 line-length = 120
 src = ["src"]
 
@@ -76,7 +76,7 @@ select = ["E", "F", "I", "N", "W", "UP", "B", "SIM"]
 "html.py" = ["E501"]
 
 [tool.mypy]
-python_version = "3.13"
+python_version = "3.11"
 strict = true
 
 [tool.pytest.ini_options]
diff --git a/src/infer_check/backends/mlx_lm.py b/src/infer_check/backends/mlx_lm.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import asyncio
 import gc
 import time
 from typing import Any, cast
@@ -85,7 +86,7 @@ async def cleanup(self) -> None:
         """Release model references and trigger garbage collection."""
         self._model = None
         self._tokenizer = None
-        gc.collect()
+        await asyncio.to_thread(gc.collect)
 
     # ------------------------------------------------------------------
     # Internal helpers
diff --git a/src/infer_check/backends/openai_compat.py b/src/infer_check/backends/openai_compat.py
@@ -101,6 +101,8 @@ async def _generate_chat(self, prompt: Prompt) -> InferenceResult:
         choice = data["choices"][0]
         message = choice.get("message", {})
         text: str = message.get("content", "")
+        if not text:
+            text = message.get("reasoning_content", "")
         tokens = text.split()
 
         usage = data.get("usage", {})
diff --git a/src/infer_check/cli.py b/src/infer_check/cli.py
@@ -4,16 +4,34 @@
 
 import asyncio
 import json
+from collections.abc import Callable
 from datetime import UTC, datetime
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, TypeVar
 
 import click
 from rich.console import Console
 from rich.table import Table
 
 console = Console()
 
+F = TypeVar("F", bound=Callable[..., Any])
+
+
+def common_options(f: F) -> F:
+    """Add common options to all subcommands."""
+    options = [
+        click.option(
+            "--max-tokens",
+            default=None,
+            type=click.IntRange(min=1, clamp=True),
+            help="Override default max tokens for generation.",
+        ),
+    ]
+    for option in reversed(options):
+        f = option(f)
+    return f
+
 
 def _resolve_prompts(prompts: str) -> Path:
     """Resolve a prompt suite name or path to an actual file path."""
@@ -27,8 +45,17 @@ def _resolve_prompts(prompts: str) -> Path:
 
 @click.group()
 @click.version_option(package_name="infer-check")
-def main() -> None:
+@click.option(
+    "--max-tokens",
+    default=1024,
+    show_default=True,
+    help="Default max tokens for generation (applies to all prompts unless they specify their own).",
+)
+@click.pass_context
+def main(ctx: click.Context, max_tokens: int) -> None:
     """infer-check: correctness and reliability testing for LLM inference engines."""
+    ctx.ensure_object(dict)
+    ctx.obj["max_tokens"] = max_tokens
 
 
 # ---------------------------------------------------------------------------
@@ -65,13 +92,17 @@ def main() -> None:
     help="Baseline label (defaults to first in --models).",
 )
 @click.option("--base-url", default=None, help="Base URL for HTTP backends.")
+@common_options
+@click.pass_context
 def sweep(
+    ctx: click.Context,
     models: str,
     backend: str | None,
     prompts: str,
     output: Path,
     baseline: str | None,
     base_url: str | None,
+    max_tokens: int | None,
 ) -> None:
     """Run a quantization sweep: compare pre-quantized models against a baseline.
 
@@ -90,6 +121,10 @@ def sweep(
     from infer_check.runner import TestRunner
     from infer_check.suites.loader import load_suite
 
+    # Update max_tokens from subcommand if provided
+    if max_tokens is not None:
+        ctx.obj["max_tokens"] = max_tokens
+
     # Parse label=model_path pairs
     model_map: dict[str, str] = {}
     for entry in models.split(","):
@@ -117,8 +152,11 @@ def sweep(
     for label, path in model_map.items():
         tag = " (baseline)" if label == baseline_label else ""
         console.print(f"  {label}: {path}{tag}")
-
     prompt_list = load_suite(_resolve_prompts(prompts))
+    # Apply global max_tokens only if not explicitly set in the prompt JSONL
+    for p in prompt_list:
+        if "max_tokens" not in p.model_fields_set:
+            p.max_tokens = ctx.obj["max_tokens"]
 
     # Build a separate backend for each model
     backend_map: dict[str, Any] = {}
@@ -257,7 +295,10 @@ def sweep(
     show_default=True,
     help="Generate an HTML comparison report after the run.",
 )
+@common_options
+@click.pass_context
 def compare(
+    ctx: click.Context,
     model_a: str,
     model_b: str,
     prompts: str,
@@ -266,6 +307,7 @@ def compare(
     label_a: str | None,
     label_b: str | None,
     report: bool,
+    max_tokens: int | None,
 ) -> None:
     """Compare two quantizations of the same model.
 
@@ -295,6 +337,10 @@ def compare(
     from infer_check.runner import TestRunner
     from infer_check.suites.loader import load_suite
 
+    # Update max_tokens from subcommand if provided
+    if max_tokens is not None:
+        ctx.obj["max_tokens"] = max_tokens
+
     resolved_a = resolve_model(model_a, base_url=base_url, label=label_a)
     resolved_b = resolve_model(model_b, base_url=base_url, label=label_b)
 
@@ -305,6 +351,11 @@ def compare(
     )
 
     prompt_list = load_suite(_resolve_prompts(prompts))
+    # Apply global max_tokens only if not explicitly set in the prompt JSONL
+    for p in prompt_list:
+        if "max_tokens" not in p.model_fields_set:
+            p.max_tokens = ctx.obj["max_tokens"]
+
     console.print(f"  prompts: {len(prompt_list)} from '{prompts}'")
 
     # ── Build backends ───────────────────────────────────────────────
@@ -510,20 +561,28 @@ def compare(
     show_default=True,
     help="Use /v1/chat/completions for HTTP backends (applies chat template server-side).",
 )
+@common_options
+@click.pass_context
 def diff(
+    ctx: click.Context,
     model: str,
     backends: str,
     prompts: str,
     output: Path,
     quant: str | None,
     base_urls: str | None,
     chat: bool,
+    max_tokens: int | None,
 ) -> None:
     """Compare outputs across different backends for the same model and prompts."""
     from infer_check.backends.base import BackendConfig, get_backend
     from infer_check.runner import TestRunner
     from infer_check.suites.loader import load_suite
 
+    # Update max_tokens from subcommand if provided
+    if max_tokens is not None:
+        ctx.obj["max_tokens"] = max_tokens
+
     backend_names = [b.strip() for b in backends.split(",") if b.strip()]
     url_list: list[str | None] = [u.strip() for u in base_urls.split(",")] if base_urls else [None] * len(backend_names)
     # Pad url_list if shorter than backend_names
@@ -533,6 +592,10 @@ def diff(
     console.print(f"[bold cyan]diff[/bold cyan] model={model} backends={backend_names} quant={quant}")
 
     prompt_list = load_suite(_resolve_prompts(prompts))
+    # Apply global max_tokens only if not explicitly set in the prompt JSONL
+    for p in prompt_list:
+        if "max_tokens" not in p.model_fields_set:
+            p.max_tokens = ctx.obj["max_tokens"]
 
     backend_instances = []
     for name, url in zip(backend_names, url_list, strict=True):
@@ -619,19 +682,27 @@ def diff(
     help="Comma-separated concurrency levels.",
 )
 @click.option("--base-url", default=None, help="Base URL for HTTP backends.")
+@common_options
+@click.pass_context
 def stress(
+    ctx: click.Context,
     model: str,
     backend: str | None,
     prompts: str,
     output: Path,
     concurrency: str,
     base_url: str | None,
+    max_tokens: int | None,
 ) -> None:
     """Stress-test a backend with varying concurrency levels."""
     from infer_check.backends.base import get_backend_for_model
     from infer_check.runner import TestRunner
     from infer_check.suites.loader import load_suite
 
+    # Update max_tokens from subcommand if provided
+    if max_tokens is not None:
+        ctx.obj["max_tokens"] = max_tokens
+
     concurrency_levels = [int(c.strip()) for c in concurrency.split(",") if c.strip()]
 
     backend_instance = get_backend_for_model(
@@ -645,6 +716,10 @@ def stress(
     )
 
     prompt_list = load_suite(_resolve_prompts(prompts))
+    # Apply global max_tokens only if not explicitly set in the prompt JSONL
+    for p in prompt_list:
+        if "max_tokens" not in p.model_fields_set:
+            p.max_tokens = ctx.obj["max_tokens"]
 
     runner = TestRunner()
     stress_results = asyncio.run(
@@ -704,19 +779,27 @@ def stress(
 )
 @click.option("--runs", default=100, show_default=True, type=int, help="Number of runs per prompt.")
 @click.option("--base-url", default=None, help="Base URL for HTTP backends.")
+@common_options
+@click.pass_context
 def determinism(
+    ctx: click.Context,
     model: str,
     backend: str | None,
     prompts: str,
     output: Path,
     runs: int,
     base_url: str | None,
+    max_tokens: int | None,
 ) -> None:
     """Test whether a backend produces identical outputs across repeated runs at temperature=0."""
     from infer_check.backends.base import get_backend_for_model
     from infer_check.runner import TestRunner
     from infer_check.suites.loader import load_suite
 
+    # Update max_tokens from subcommand if provided
+    if max_tokens is not None:
+        ctx.obj["max_tokens"] = max_tokens
+
     backend_instance = get_backend_for_model(
         model_str=model,
         backend_type=backend,
@@ -726,6 +809,10 @@ def determinism(
     console.print(f"[bold cyan]determinism[/bold cyan] model={model} backend={backend_instance.name} runs={runs}")
 
     prompt_list = load_suite(_resolve_prompts(prompts))
+    # Apply global max_tokens only if not explicitly set in the prompt JSONL
+    for p in prompt_list:
+        if "max_tokens" not in p.model_fields_set:
+            p.max_tokens = ctx.obj["max_tokens"]
 
     runner = TestRunner()
     det_results = asyncio.run(
diff --git a/src/infer_check/types.py b/src/infer_check/types.py
@@ -48,7 +48,7 @@ class Prompt(BaseInferModel):
     id: str = Field(default_factory=_generate_uuid)
     text: str
     category: str = "general"
-    max_tokens: int = 256
+    max_tokens: int = 1024
     metadata: dict[str, Any] = Field(default_factory=dict)
 
 
diff --git a/tests/unit/test_cli_max_tokens.py b/tests/unit/test_cli_max_tokens.py
diff --git a/tests/unit/test_openai_compat.py b/tests/unit/test_openai_compat.py