@@ -46,6 +46,16 @@ def common_options(f: F) -> F:
4646 "comparable across runs; pass --enable-thinking to restore it."
4747 ),
4848 ),
49+ click .option (
50+ "--chat/--no-chat" ,
51+ default = True ,
52+ show_default = True ,
53+ help = (
54+ "Use /v1/chat/completions for HTTP backends (applies chat "
55+ "template server-side). Pass --no-chat to use raw "
56+ "/v1/completions instead. Ignored for mlx-lm."
57+ ),
58+ ),
4959 ]
5060 for option in reversed (options ):
5161 f = option (f )
@@ -151,6 +161,7 @@ def sweep(
151161 max_tokens : int | None ,
152162 num_prompts : int | None ,
153163 disable_thinking : bool ,
164+ chat : bool ,
154165) -> None :
155166 """Run a quantization sweep: compare pre-quantized models against a baseline.
156167
@@ -207,6 +218,7 @@ def sweep(
207218 base_url = base_url ,
208219 quantization = label ,
209220 disable_thinking = disable_thinking ,
221+ chat = chat ,
210222 )
211223
212224 runner = TestRunner ()
@@ -351,6 +363,7 @@ def compare(
351363 max_tokens : int | None ,
352364 num_prompts : int | None ,
353365 disable_thinking : bool ,
366+ chat : bool ,
354367) -> None :
355368 """Compare two quantizations of the same model.
356369
@@ -402,6 +415,7 @@ def compare(
402415 hf_revision = resolved_a .revision ,
403416 base_url = resolved_a .base_url ,
404417 disable_thinking = disable_thinking ,
418+ extra = {"chat" : chat },
405419 )
406420 config_b = BackendConfig (
407421 backend_type = resolved_b .backend ,
@@ -410,6 +424,7 @@ def compare(
410424 hf_revision = resolved_b .revision ,
411425 base_url = resolved_b .base_url ,
412426 disable_thinking = disable_thinking ,
427+ extra = {"chat" : chat },
413428 )
414429 backend_a = get_backend (config_a )
415430 backend_b = get_backend (config_b )
@@ -591,12 +606,6 @@ def compare(
591606 default = None ,
592607 help = "Comma-separated base URLs for HTTP backends (positionally matched to --backends)." ,
593608)
594- @click .option (
595- "--chat/--no-chat" ,
596- default = True ,
597- show_default = True ,
598- help = "Use /v1/chat/completions for HTTP backends (applies chat template server-side)." ,
599- )
600609@common_options
601610@click .pass_context
602611def diff (
@@ -607,10 +616,10 @@ def diff(
607616 output : Path ,
608617 quant : str | None ,
609618 base_urls : str | None ,
610- chat : bool ,
611619 max_tokens : int | None ,
612620 num_prompts : int | None ,
613621 disable_thinking : bool ,
622+ chat : bool ,
614623) -> None :
615624 """Compare outputs across different backends for the same model and prompts."""
616625 from infer_check .backends .base import BackendConfig , get_backend
@@ -741,6 +750,7 @@ def stress(
741750 max_tokens : int | None ,
742751 num_prompts : int | None ,
743752 disable_thinking : bool ,
753+ chat : bool ,
744754) -> None :
745755 """Stress-test a backend with varying concurrency levels."""
746756 from infer_check .backends .base import get_backend_for_model
@@ -755,6 +765,7 @@ def stress(
755765 backend_type = backend ,
756766 base_url = base_url ,
757767 disable_thinking = disable_thinking ,
768+ chat = chat ,
758769 )
759770
760771 console .print (
@@ -832,6 +843,7 @@ def determinism(
832843 max_tokens : int | None ,
833844 num_prompts : int | None ,
834845 disable_thinking : bool ,
846+ chat : bool ,
835847) -> None :
836848 """Test whether a backend produces identical outputs across repeated runs at temperature=0."""
837849 from infer_check .backends .base import get_backend_for_model
@@ -844,6 +856,7 @@ def determinism(
844856 backend_type = backend ,
845857 base_url = base_url ,
846858 disable_thinking = disable_thinking ,
859+ chat = chat ,
847860 )
848861
849862 console .print (f"[bold cyan]determinism[/bold cyan] model={ model } backend={ backend_instance .name } runs={ runs } " )
0 commit comments