feat: add global --max-tokens CLI flag and increase default max_tokens to 1024

NullPointerDepressiveDisorder · NullPointerDepressiveDisorder · commit 50210eeb3e3c · 2026-04-02T20:48:41.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+# 0.2.0 (2026-04-02)
+
+- Added global `--max-tokens` flag (defaults to 1024) to the main CLI.
+- Increased default `max_tokens` for all prompts from 256 to 1024.
+
 # 0.1.0 (2026-03-11)
 
 - Initial release
diff --git a/README.md b/README.md
@@ -68,7 +68,7 @@ pip install "infer-check[mlx]"
 
 ### Quantization sweep
 
-Compare pre-quantized models against a baseline. Each model is a separate HuggingFace repo.
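+Compare pre-quantized models against a baseline. Each model is a separate HuggingFace repo. Generation length is controlled by the global `--max-tokens` option (defaults to 1024); it is defined on the top-level `infer-check` command, so it goes before the subcommand name (`infer-check --max-tokens 512 sweep ...`).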
 
 ```
 infer-check sweep \
@@ -77,6 +77,7 @@ infer-check sweep \
             4bit=mlx-community/Meta-Llama-3.1-8B-Instruct-4bit" \
   --backend mlx-lm \
   --prompts reasoning \
+  --max-tokens 512 \
   --output ./results/sweep/
 ```
 
@@ -161,7 +162,7 @@ Curated prompts targeting known quantization failure modes:
 | `quant-sensitive.jsonl` | 20 | Multi-digit arithmetic, long CoT, precise syntax |
 | `determinism.jsonl` | 50 | High-entropy continuations for determinism testing |
 
-All suites ship with the package — no need to clone the repo. Custom suites are JSONL files with one object per line:
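+All suites ship with the package — no need to clone the repo. Custom suites are JSONL files with one object per line (`max_tokens` defaults to 1024 when omitted; note that the CLI `--max-tokens` value currently replaces the `max_tokens` of every loaded prompt):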
 
 ```json
 {"id": "custom-001", "text": "Your prompt here", "category": "math", "max_tokens": 512}
diff --git a/src/infer_check/cli.py b/src/infer_check/cli.py
@@ -27,8 +27,17 @@ def _resolve_prompts(prompts: str) -> Path:
 
 @click.group()
 @click.version_option(package_name="infer-check")
-def main() -> None:
+@click.option(
+    "--max-tokens",
+    default=1024,
+    show_default=True,
+    help="Max tokens for generation (overrides the max_tokens of every prompt in the suite, including per-prompt values).",
+)
+@click.pass_context
+def main(ctx: click.Context, max_tokens: int) -> None:
     """infer-check: correctness and reliability testing for LLM inference engines."""
+    ctx.ensure_object(dict)
+    ctx.obj["max_tokens"] = max_tokens
 
 
 # ---------------------------------------------------------------------------
@@ -65,7 +74,9 @@ def main() -> None:
     help="Baseline label (defaults to first in --models).",
 )
 @click.option("--base-url", default=None, help="Base URL for HTTP backends.")
+@click.pass_context
 def sweep(
+    ctx: click.Context,
     models: str,
     backend: str | None,
     prompts: str,
@@ -117,8 +128,10 @@ def sweep(
     for label, path in model_map.items():
         tag = " (baseline)" if label == baseline_label else ""
         console.print(f"  {label}: {path}{tag}")
-
     prompt_list = load_suite(_resolve_prompts(prompts))
+    # Apply global max_tokens
+    for p in prompt_list:
+        p.max_tokens = ctx.obj["max_tokens"]
 
     # Build a separate backend for each model
     backend_map: dict[str, Any] = {}
@@ -257,7 +270,9 @@ def sweep(
     show_default=True,
     help="Generate an HTML comparison report after the run.",
 )
+@click.pass_context
 def compare(
+    ctx: click.Context,
     model_a: str,
     model_b: str,
     prompts: str,
@@ -305,6 +320,10 @@ def compare(
     )
 
     prompt_list = load_suite(_resolve_prompts(prompts))
+    # Apply global max_tokens
+    for p in prompt_list:
+        p.max_tokens = ctx.obj["max_tokens"]
+
     console.print(f"  prompts: {len(prompt_list)} from '{prompts}'")
 
     # ── Build backends ───────────────────────────────────────────────
@@ -510,7 +529,9 @@ def compare(
     show_default=True,
     help="Use /v1/chat/completions for HTTP backends (applies chat template server-side).",
 )
+@click.pass_context
 def diff(
+    ctx: click.Context,
     model: str,
     backends: str,
     prompts: str,
@@ -533,6 +554,9 @@ def diff(
     console.print(f"[bold cyan]diff[/bold cyan] model={model} backends={backend_names} quant={quant}")
 
     prompt_list = load_suite(_resolve_prompts(prompts))
+    # Apply global max_tokens
+    for p in prompt_list:
+        p.max_tokens = ctx.obj["max_tokens"]
 
     backend_instances = []
     for name, url in zip(backend_names, url_list, strict=True):
@@ -619,7 +643,9 @@ def diff(
     help="Comma-separated concurrency levels.",
 )
 @click.option("--base-url", default=None, help="Base URL for HTTP backends.")
+@click.pass_context
 def stress(
+    ctx: click.Context,
     model: str,
     backend: str | None,
     prompts: str,
@@ -645,6 +671,9 @@ def stress(
     )
 
     prompt_list = load_suite(_resolve_prompts(prompts))
+    # Apply global max_tokens
+    for p in prompt_list:
+        p.max_tokens = ctx.obj["max_tokens"]
 
     runner = TestRunner()
     stress_results = asyncio.run(
@@ -704,7 +733,9 @@ def stress(
 )
 @click.option("--runs", default=100, show_default=True, type=int, help="Number of runs per prompt.")
 @click.option("--base-url", default=None, help="Base URL for HTTP backends.")
+@click.pass_context
 def determinism(
+    ctx: click.Context,
     model: str,
     backend: str | None,
     prompts: str,
@@ -726,6 +757,9 @@ def determinism(
     console.print(f"[bold cyan]determinism[/bold cyan] model={model} backend={backend_instance.name} runs={runs}")
 
     prompt_list = load_suite(_resolve_prompts(prompts))
+    # Apply global max_tokens
+    for p in prompt_list:
+        p.max_tokens = ctx.obj["max_tokens"]
 
     runner = TestRunner()
     det_results = asyncio.run(
diff --git a/src/infer_check/types.py b/src/infer_check/types.py
@@ -48,7 +48,7 @@ class Prompt(BaseInferModel):
     id: str = Field(default_factory=_generate_uuid)
     text: str
     category: str = "general"
-    max_tokens: int = 256
+    max_tokens: int = 1024
     metadata: dict[str, Any] = Field(default_factory=dict)