Skip to content

Commit 02654cd

Browse files
feat: add --chat option to CLI and propagate chat mode to backend configuration
1 parent 10b16b2 commit 02654cd

2 files changed

Lines changed: 22 additions & 7 deletions

File tree

src/infer_check/backends/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ def get_backend_for_model(
105105
base_url: str | None = None,
106106
quantization: str | None = None,
107107
disable_thinking: bool = True,
108+
chat: bool = True,
108109
) -> BackendAdapter:
109110
"""Resolve model string to a backend and instantiate it.
110111
@@ -122,6 +123,7 @@ def get_backend_for_model(
122123
quantization=quantization or resolved.label,
123124
hf_revision=resolved.revision,
124125
disable_thinking=disable_thinking,
126+
extra={"chat": chat},
125127
)
126128

127129
return get_backend(config)

src/infer_check/cli.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,16 @@ def common_options(f: F) -> F:
4646
"comparable across runs; pass --enable-thinking to restore it."
4747
),
4848
),
49+
click.option(
50+
"--chat/--no-chat",
51+
default=True,
52+
show_default=True,
53+
help=(
54+
"Use /v1/chat/completions for HTTP backends (applies chat "
55+
"template server-side). Pass --no-chat to use raw "
56+
"/v1/completions instead. Ignored for mlx-lm."
57+
),
58+
),
4959
]
5060
for option in reversed(options):
5161
f = option(f)
@@ -151,6 +161,7 @@ def sweep(
151161
max_tokens: int | None,
152162
num_prompts: int | None,
153163
disable_thinking: bool,
164+
chat: bool,
154165
) -> None:
155166
"""Run a quantization sweep: compare pre-quantized models against a baseline.
156167
@@ -207,6 +218,7 @@ def sweep(
207218
base_url=base_url,
208219
quantization=label,
209220
disable_thinking=disable_thinking,
221+
chat=chat,
210222
)
211223

212224
runner = TestRunner()
@@ -351,6 +363,7 @@ def compare(
351363
max_tokens: int | None,
352364
num_prompts: int | None,
353365
disable_thinking: bool,
366+
chat: bool,
354367
) -> None:
355368
"""Compare two quantizations of the same model.
356369
@@ -402,6 +415,7 @@ def compare(
402415
hf_revision=resolved_a.revision,
403416
base_url=resolved_a.base_url,
404417
disable_thinking=disable_thinking,
418+
extra={"chat": chat},
405419
)
406420
config_b = BackendConfig(
407421
backend_type=resolved_b.backend,
@@ -410,6 +424,7 @@ def compare(
410424
hf_revision=resolved_b.revision,
411425
base_url=resolved_b.base_url,
412426
disable_thinking=disable_thinking,
427+
extra={"chat": chat},
413428
)
414429
backend_a = get_backend(config_a)
415430
backend_b = get_backend(config_b)
@@ -591,12 +606,6 @@ def compare(
591606
default=None,
592607
help="Comma-separated base URLs for HTTP backends (positionally matched to --backends).",
593608
)
594-
@click.option(
595-
"--chat/--no-chat",
596-
default=True,
597-
show_default=True,
598-
help="Use /v1/chat/completions for HTTP backends (applies chat template server-side).",
599-
)
600609
@common_options
601610
@click.pass_context
602611
def diff(
@@ -607,10 +616,10 @@ def diff(
607616
output: Path,
608617
quant: str | None,
609618
base_urls: str | None,
610-
chat: bool,
611619
max_tokens: int | None,
612620
num_prompts: int | None,
613621
disable_thinking: bool,
622+
chat: bool,
614623
) -> None:
615624
"""Compare outputs across different backends for the same model and prompts."""
616625
from infer_check.backends.base import BackendConfig, get_backend
@@ -741,6 +750,7 @@ def stress(
741750
max_tokens: int | None,
742751
num_prompts: int | None,
743752
disable_thinking: bool,
753+
chat: bool,
744754
) -> None:
745755
"""Stress-test a backend with varying concurrency levels."""
746756
from infer_check.backends.base import get_backend_for_model
@@ -755,6 +765,7 @@ def stress(
755765
backend_type=backend,
756766
base_url=base_url,
757767
disable_thinking=disable_thinking,
768+
chat=chat,
758769
)
759770

760771
console.print(
@@ -832,6 +843,7 @@ def determinism(
832843
max_tokens: int | None,
833844
num_prompts: int | None,
834845
disable_thinking: bool,
846+
chat: bool,
835847
) -> None:
836848
"""Test whether a backend produces identical outputs across repeated runs at temperature=0."""
837849
from infer_check.backends.base import get_backend_for_model
@@ -844,6 +856,7 @@ def determinism(
844856
backend_type=backend,
845857
base_url=base_url,
846858
disable_thinking=disable_thinking,
859+
chat=chat,
847860
)
848861

849862
console.print(f"[bold cyan]determinism[/bold cyan] model={model} backend={backend_instance.name} runs={runs}")

0 commit comments

Comments (0)