3333from transformers import AutoTokenizer
3434from transformers .utils import logging as transformers_logging
3535
36+ from inference_endpoint .commands .utils import get_default_report_path
3637from inference_endpoint .config .runtime_settings import RuntimeSettings
3738from inference_endpoint .config .schema import (
3839 BenchmarkConfig ,
@@ -267,7 +268,11 @@ def _build_config_from_cli(
267268 load_pattern_type = LoadPatternType .CONCURRENCY
268269 case "online" :
269270 load_pattern_type = LoadPatternType .POISSON
270- report_dir = getattr (args , "report_dir" , None )
271+ report_dir = getattr (
272+ args ,
273+ "report_dir" ,
274+ get_default_report_path (),
275+ )
271276 timeout = getattr (args , "timeout" , None )
272277 verbose = getattr (args , "verbose" , False )
273278 output = getattr (args , "output" , None )
@@ -457,8 +462,11 @@ def _run_benchmark(
457462
458463 if config .report_dir :
459464 report_dir = Path (config .report_dir )
460- report_dir .mkdir (parents = True , exist_ok = True )
461- config .to_yaml_file (report_dir / "config.yaml" )
465+ else :
466+ report_dir = get_default_report_path ()
467+
468+ report_dir .mkdir (parents = True , exist_ok = True )
469+ config .to_yaml_file (report_dir / "config.yaml" )
462470
463471 max_tokens = config .model_params .max_new_tokens
464472
@@ -584,6 +592,8 @@ def _run_benchmark(
584592 endpoint_url = urljoin (endpoint , "/v1/chat/completions" ),
585593 num_workers = num_workers ,
586594 max_concurrency = - 1 , # unlimited
595+ record_worker_events = config .settings .client .record_worker_events ,
596+ event_logs_dir = report_dir ,
587597 )
588598 aiohttp_config = AioHttpConfig ()
589599 zmq_config = ZMQConfig (
@@ -615,7 +625,7 @@ def _run_benchmark(
615625 scheduler ,
616626 name = f"cli_benchmark_{ uuid .uuid4 ().hex [0 :8 ]} " ,
617627 stop_sample_issuer_on_test_end = False ,
618- report_dir = config . report_dir ,
628+ report_dir = report_dir ,
619629 tokenizer_override = tokenizer ,
620630 max_shutdown_timeout_s = config .timeout if config .timeout else None ,
621631 )
@@ -636,9 +646,7 @@ def signal_handler(signum, frame):
636646
637647 elapsed_time = time .time () - start_time
638648 success_count = response_collector .count - len (response_collector .errors )
639- estimated_qps = (
640- response_collector .count / elapsed_time if elapsed_time > 0 else 0
641- )
649+ estimated_qps = success_count / elapsed_time if elapsed_time > 0 else 0
642650
643651 # Report results
644652 logger .info (f"Completed in { elapsed_time :.1f} s" )
0 commit comments