11import asyncio
2+ import csv
23import json
34import re
45import sys
@@ -527,6 +528,59 @@ async def _list(round_number: int, schedule: Schedule, current_round: int, exclu
527528 # Always display as table
528529 _display_commitments_table (results , round_number )
529530
531+
def _echo_container_section(
    title: str,
    detail_label: str,
    entries: list[tuple[str, str]],
    *,
    leading_newline: bool = False,
) -> None:
    """Echo one titled three-column table (ID, container name, detail) to stderr.

    Args:
        title: Section heading; the number of entries is appended in parentheses.
        detail_label: Heading of the third column (e.g. ``"URL"`` or ``"Error"``).
        entries: ``(container name, detail)`` pairs, one per table row.
        leading_newline: When true, a newline is emitted before the top rule.
    """
    rule = "=" * 120

    # Each column is as wide as its longest cell, but never narrower than its heading.
    name_width = max([len("Container Name"), *(len(name) for name, _ in entries)])
    detail_width = max([len(detail_label), *(len(detail) for _, detail in entries)])

    # Section banner
    click.echo(f"\n{rule}" if leading_newline else rule, err=True)
    click.echo(f"{title} ({len(entries)})", err=True)
    click.echo(rule, err=True)

    # Table header
    click.echo(f"{'ID':<4} {'Container Name':<{name_width}} {detail_label:<{detail_width}}", err=True)
    click.echo("-" * 120, err=True)

    # Table rows, numbered from 1
    for idx, (name, detail) in enumerate(entries, 1):
        click.echo(f"{idx:<4} {name:<{name_width}} {detail:<{detail_width}}", err=True)

    click.echo(f"{rule}\n", err=True)


def _display_containers_table(
    successful_containers: list[tuple[str, str]], failed_containers: list[tuple[str, str]]
) -> None:
    """Display successful and failed containers in formatted tables.

    Each non-empty list is written to stderr as its own table; an empty list
    produces no output at all.

    Args:
        successful_containers: ``(container name, URL)`` pairs.
        failed_containers: ``(container name, error message)`` pairs.
    """
    # Only the success table is preceded by a newline, matching the original layout.
    if successful_containers:
        _echo_container_section("Successful Containers", "URL", successful_containers, leading_newline=True)

    if failed_containers:
        _echo_container_section("Failed Containers", "Error", failed_containers)
583+
530584
531585def _display_commitments_table (results : list [dict ], round_number : int ) -> None :
532586 """Display commitments in a formatted table."""
@@ -638,7 +692,9 @@ def _parse_commitments(commitments: dict, round_number: int, schedule: Schedule,
638692 help = "HuggingFace token to pass as HF_TOKEN environment variable" ,
639693)
640694@click .option ("--name" , "container_name" , default = None , help = "Custom container name (default: generator)" )
641- def start_generator_cmd (image_url : str , targon_api_key : str , hf_token : str | None , container_name : str | None ) -> None :
695+ @click .option ("--count" , default = 1 , help = "Number of models to generate (default: 1)" )
696+ @click .option ("--output-file" , default = "generators.csv" , help = "Path to the CSV file where container names and URLs will be saved (default: generators.csv)" )
697+ def start_generator_cmd (image_url : str , targon_api_key : str , hf_token : str | None , container_name : str | None , count : float , output_file : str ) -> None :
642698 """Start the generator container."""
643699 click .echo (f"Starting generator: { image_url } " , err = True )
644700
@@ -648,24 +704,78 @@ def start_generator_cmd(image_url: str, targon_api_key: str, hf_token: str | Non
648704 env = {"HF_TOKEN" : hf_token }
649705
650706 # Format container name: "generator_{name}" if name provided, otherwise use default
651- if container_name :
652- name = f"generator_{ container_name } "
653- else :
654- name = _GENERATOR_POD_NAME
655-
656- container_url = asyncio .run (
657- _create_container (
658- image_url = image_url ,
659- container_name = name ,
660- targon_api_key = targon_api_key ,
661- resource_name = "h200-small" ,
662- port = _GENERATOR_PORT ,
663- health_check_path = _GENERATOR_HEALTH_CHECK_PATH ,
664- echo = lambda msg : click .echo (msg , err = True ),
665- env = env ,
707+ container_names : list [str ] = []
708+ for idx in range (int (count )):
709+ if container_name :
710+ name = f"generator_{ container_name } _{ idx } "
711+ else :
712+ name = f"{ _GENERATOR_POD_NAME } _{ idx } "
713+ container_names .append (name )
714+
715+ # Create async function to run tasks
716+ async def _create_all_containers () -> list [str | Exception ]:
717+ """Create all containers concurrently using tasks."""
718+ tasks : list [asyncio .Task [str ]] = []
719+ for name in container_names :
720+ task = asyncio .create_task (
721+ _create_container (
722+ image_url = image_url ,
723+ container_name = name ,
724+ targon_api_key = targon_api_key ,
725+ resource_name = "h200-small" ,
726+ port = _GENERATOR_PORT ,
727+ health_check_path = _GENERATOR_HEALTH_CHECK_PATH ,
728+ echo = lambda msg : click .echo (msg , err = True ),
729+ env = env ,
730+ )
731+ )
732+ tasks .append (task )
733+
734+ # Gather results, allowing some tasks to fail
735+ return await asyncio .gather (* tasks , return_exceptions = True )
736+
737+ # Run the async function
738+ results = asyncio .run (_create_all_containers ())
739+
740+ # Separate successful and failed containers
741+ successful_containers : list [tuple [str , str ]] = []
742+ failed_containers : list [tuple [str , str ]] = []
743+
744+ for name , result in zip (container_names , results ):
745+ if isinstance (result , Exception ):
746+ error_msg = str (result )
747+ failed_containers .append ((name , error_msg ))
748+ logger .error (f"Failed to create container { name } : { error_msg } " )
749+ else :
750+ successful_containers .append ((name , result ))
751+
752+ # Display results in tables
753+ _display_containers_table (successful_containers , failed_containers )
754+
755+ # Write successful containers to CSV file
756+ with Path (output_file ).open ("w" , newline = "" ) as f :
757+ writer = csv .writer (f )
758+ writer .writerow (["container_name" , "url" ])
759+ for name , url in successful_containers :
760+ writer .writerow ([name , url ])
761+
762+ # Report results
763+ if failed_containers :
764+ click .echo (
765+ json .dumps (
766+ {
767+ "success" : len (successful_containers ) > 0 ,
768+ "output_file" : output_file ,
769+ "created" : len (successful_containers ),
770+ "failed" : len (failed_containers ),
771+ "failed_containers" : [
772+ {"name" : name , "error" : error } for name , error in failed_containers
773+ ],
774+ }
775+ )
666776 )
667- )
668- click .echo (json .dumps ({"success" : True , "container_url " : container_url }))
777+ else :
778+ click .echo (json .dumps ({"success" : True , "output_file " : output_file , "created" : len ( successful_containers ) }))
669779 except KeyboardInterrupt :
670780 logger .warning ("Generator start interrupted by user" )
671781 click .echo (json .dumps ({"success" : False , "error" : "Interrupted by user" }))
@@ -800,21 +910,51 @@ def judge_cmd(
800910
801911@cli .command ("stop-pods" )
802912@click .option ("--targon-api-key" , required = True , help = "Targon API key." )
803- def stop_pods_cmd (targon_api_key : str ) -> None :
804- """Stop the generator, render and judge pods."""
913+ @click .option ("--file" , "csv_file" , default = "generators.csv" , help = "Path to CSV file with container names and URLs (default: generators.csv)" )
914+ def stop_pods_cmd (targon_api_key : str , csv_file : str ) -> None :
915+ """Stop generator containers listed in a CSV file, plus render and judge pods."""
805916 click .echo ("Stopping pods..." , err = True )
917+
918+ # Read container names from CSV file
919+ container_names_from_csv : list [str ] = []
920+ try :
921+ with Path (csv_file ).open ("r" , newline = "" ) as f :
922+ reader = csv .DictReader (f )
923+ for row in reader :
924+ container_name = row .get ("container_name" , "" ).strip ()
925+ if container_name :
926+ container_names_from_csv .append (container_name )
927+
928+ if container_names_from_csv :
929+ click .echo (f"Found { len (container_names_from_csv )} container(s) in { csv_file } " , err = True )
930+ except FileNotFoundError :
931+ click .echo (f"CSV file { csv_file } not found, will only stop render and judge pods" , err = True )
932+ except Exception as e :
933+ click .echo (f"Failed to read CSV file { csv_file } : { e } , will only stop render and judge pods" , err = True )
806934
807935 async def _stop () -> None :
808936 async with TargonClient (api_key = targon_api_key ) as targon :
809937 containers = await targon .list_containers ()
938+ # Create a mapping of container names to UIDs
939+ name_to_uid : dict [str , str ] = {c .name : c .uid for c in containers }
940+ stopped : list [str ] = []
941+ not_found : list [str ] = []
942+ # Stop containers from CSV file
943+ for container_name in container_names_from_csv :
944+ if container_name in name_to_uid :
945+ click .echo (f"Stopping container { container_name } ({ name_to_uid [container_name ]} )" , err = True )
946+ await targon .delete_container (name_to_uid [container_name ])
947+ stopped .append (container_name )
948+ else :
949+ click .echo (f"Container { container_name } not found" , err = True )
950+ not_found .append (container_name )
951+ # Also stop render and judge pods (legacy behavior)
810952 for c in containers :
811- # Stop all containers that start with "generator_", plus render and judge pods
812- if c .name .startswith ("generator_" ) or c .name in [
813- _RENDER_POD_NAME ,
814- _JUDGE_POD_NAME ,
815- ]:
816- click .echo (f"Stopping container { c .name } ({ c .uid } )" , err = True )
817- await targon .delete_container (c .uid )
953+ if c .name in [_RENDER_POD_NAME , _JUDGE_POD_NAME ]:
954+ if c .name not in stopped : # Avoid stopping twice if it was in CSV
955+ click .echo (f"Stopping container { c .name } ({ c .uid } )" , err = True )
956+ await targon .delete_container (c .uid )
957+ stopped .append (c .name )
818958
819959 try :
820960 asyncio .run (_stop ())
@@ -926,21 +1066,21 @@ async def _create_container(
9261066 echo = echo ,
9271067 )
9281068 if container :
929- echo (f"Container deployed successfully. UID: { container .uid } " )
930- echo (f"Container URL: { container .url } " )
1069+ echo (f"{ container_name } : Container deployed successfully. UID: { container .uid } " )
1070+ echo (f"{ container_name } : Container URL: { container .url } " )
9311071 url : str = str (container .url )
9321072 return url
9331073 else :
934- raise RuntimeError (" Failed to deploy and start container" )
1074+ raise RuntimeError (f" { container_name } : Failed to deploy and start container" )
9351075 except (KeyboardInterrupt , asyncio .CancelledError ):
936- echo (" \n Interrupted by user. Cleaning up..." )
1076+ echo (f" { container_name } : \n Interrupted by user. Cleaning up..." )
9371077 if container :
9381078 try :
9391079 async with TargonClient (api_key = targon_api_key ) as targon :
9401080 await targon .delete_container (container .uid )
941- echo (" Container deleted successfully" )
1081+ echo (f" { container_name } : Container deleted successfully" )
9421082 except Exception as cleanup_error :
943- echo (f"Error during cleanup: { cleanup_error } " )
1083+ echo (f"{ container_name } : Error during cleanup: { cleanup_error } " )
9441084 raise
9451085
9461086
0 commit comments