Skip to content

Commit 13e52e1

Browse files
committed
feat: create multiple generators at once. Remove all of them. Save info about created generators in a local file
1 parent 973bbb0 commit 13e52e1

4 files changed

Lines changed: 220 additions & 42 deletions

File tree

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,4 +210,5 @@ __marimo__/
210210
prompts.txt
211211
results/
212212
images*/
213-
duels.json
213+
duels.json
214+
generators.csv

README.md

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -164,26 +164,60 @@ Avoid these after submitting — they create new commits with different SHAs:
164164

165165
### Start Generator
166166

167-
The `start-generator` command deploys and starts a generator container on Targon. It deploys the container and outputs the container URL which can then be used with the `generate` command.
167+
The `start-generator` command deploys and starts one or more generator containers on Targon concurrently. It saves the names and URLs of the created containers to a CSV file and displays tables of the successful and failed deployments.
168168

169169
**Options:**
170170
- `--image-url` (required): URL of the Docker image to deploy
171171
- `--targon-api-key` (required): Targon API key for authentication
172+
- `--hf-token` (optional): HuggingFace token to pass as `HF_TOKEN` environment variable to containers
173+
- `--name` (optional): Custom container name prefix. Containers will be named `generator_{name}_{index}`, where `{index}` starts at 0. If not provided, the default generator name is used with the same `_{index}` suffix.
174+
- `--count` (optional, default: 1): Number of containers to create concurrently
175+
- `--output-file` (optional, default: "generators.csv"): Path to the CSV file where container names and URLs will be saved
172176

173177
**Example:**
174178
```bash
175179
404-cli start-generator \
176180
--image-url docker.io/username/model-generator:v1.0.0 \
177-
--targon-api-key your-targon-api-key-here
181+
--targon-api-key your-targon-api-key-here \
182+
--name my-model \
183+
--count 4 \
184+
--hf-token hf_xxxxxxxxxxxxx \
185+
--output-file my-generators.csv
178186
```
179187

188+
This will create 4 containers named `generator_my-model_0`, `generator_my-model_1`, `generator_my-model_2`, and `generator_my-model_3`.
189+
180190
**Output:**
181-
On success, outputs JSON with the container URL:
191+
The command displays two tables:
192+
1. **Successful Containers**: Shows ID, container name, and URL for each successfully deployed container
193+
2. **Failed Containers**: Shows ID, container name, and error message for any failed deployments
194+
195+
When every container is created successfully, the command prints this JSON to stdout:
196+
```json
197+
{"success": true, "output_file": "generators.csv", "created": 4}
198+
```
199+
200+
If some containers fail, the JSON also lists the failures (`success` is `false` only when no container could be created):
182201
```json
183-
{"success": true, "container_url": "https://generator-abc123.targon.io"}
202+
{
203+
"success": true,
204+
"output_file": "generators.csv",
205+
"created": 3,
206+
"failed": 1,
207+
"failed_containers": [
208+
{"name": "generator_my-model_2", "error": "Container deployment timeout"}
209+
]
210+
}
184211
```
185212

186-
The container URL is also displayed on stderr and should be used as the `--endpoint` parameter for the `generate` command.
213+
**CSV Output:**
214+
The CSV file contains two columns: `container_name` and `url`. Only successfully created containers are written to it, and the file is overwritten on every run.
215+
216+
**Notes:**
217+
- Containers are created concurrently for faster deployment
218+
- Container URLs can be used as the `--endpoint` parameter for the `generate` command
219+
- If some containers fail to deploy, successful ones are still saved to the CSV file
220+
- Progress messages and container status are displayed on stderr, while JSON results go to stdout
187221

188222
### Generate Models
189223

commit.py

Lines changed: 173 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import asyncio
2+
import csv
23
import json
34
import re
45
import sys
@@ -527,6 +528,59 @@ async def _list(round_number: int, schedule: Schedule, current_round: int, exclu
527528
# Always display as table
528529
_display_commitments_table(results, round_number)
529530

531+
532+
def _display_containers_table(
    successful_containers: list[tuple[str, str]], failed_containers: list[tuple[str, str]]
) -> None:
    """Display successful and failed containers in formatted tables.

    Each non-empty list is rendered as its own table on stderr; an empty
    list produces no output at all.

    Args:
        successful_containers: ``(container_name, url)`` pairs.
        failed_containers: ``(container_name, error_message)`` pairs.
    """
    if successful_containers:
        # Only the success table is preceded by a blank separator line.
        click.echo("", err=True)
        _echo_container_table("Successful Containers", "URL", successful_containers)

    if failed_containers:
        _echo_container_table("Failed Containers", "Error", failed_containers)


def _echo_container_table(title: str, value_header: str, rows: list[tuple[str, str]]) -> None:
    """Write one ``ID / Container Name / <value_header>`` table to stderr.

    Args:
        title: Caption printed above the table, followed by the row count.
        value_header: Heading of the last column (e.g. ``"URL"`` or ``"Error"``).
        rows: ``(container_name, value)`` pairs, one per table row.
    """
    # Each column is as wide as its longest cell, but never narrower than its heading.
    name_width = max([len("Container Name"), *(len(name) for name, _ in rows)])
    value_width = max([len(value_header), *(len(value) for _, value in rows)])

    header = f"{'ID':<4} {'Container Name':<{name_width}} {value_header:<{value_width}}"

    # Rules are at least 120 characters wide and grow with the header, so they
    # never end short of a row that carries a long URL or error message.
    rule_width = max(120, len(header))
    heavy_rule = "=" * rule_width

    click.echo(heavy_rule, err=True)
    click.echo(f"{title} ({len(rows)})", err=True)
    click.echo(heavy_rule, err=True)

    click.echo(header, err=True)
    click.echo("-" * rule_width, err=True)

    # IDs are 1-based positions within this table, not container identifiers.
    for idx, (name, value) in enumerate(rows, 1):
        click.echo(f"{idx:<4} {name:<{name_width}} {value:<{value_width}}", err=True)

    click.echo(f"{heavy_rule}\n", err=True)
583+
530584

531585
def _display_commitments_table(results: list[dict], round_number: int) -> None:
532586
"""Display commitments in a formatted table."""
@@ -638,7 +692,9 @@ def _parse_commitments(commitments: dict, round_number: int, schedule: Schedule,
638692
help="HuggingFace token to pass as HF_TOKEN environment variable",
639693
)
640694
@click.option("--name", "container_name", default=None, help="Custom container name (default: generator)")
641-
def start_generator_cmd(image_url: str, targon_api_key: str, hf_token: str | None, container_name: str | None) -> None:
695+
@click.option("--count", default=1, help="Number of models to generate (default: 1)")
696+
@click.option("--output-file", default="generators.csv", help="Path to the CSV file where container names and URLs will be saved (default: generators.csv)")
697+
def start_generator_cmd(image_url: str, targon_api_key: str, hf_token: str | None, container_name: str | None, count: float, output_file: str) -> None:
642698
"""Start the generator container."""
643699
click.echo(f"Starting generator: {image_url}", err=True)
644700

@@ -648,24 +704,78 @@ def start_generator_cmd(image_url: str, targon_api_key: str, hf_token: str | Non
648704
env = {"HF_TOKEN": hf_token}
649705

650706
# Format container name: "generator_{name}" if name provided, otherwise use default
651-
if container_name:
652-
name = f"generator_{container_name}"
653-
else:
654-
name = _GENERATOR_POD_NAME
655-
656-
container_url = asyncio.run(
657-
_create_container(
658-
image_url=image_url,
659-
container_name=name,
660-
targon_api_key=targon_api_key,
661-
resource_name="h200-small",
662-
port=_GENERATOR_PORT,
663-
health_check_path=_GENERATOR_HEALTH_CHECK_PATH,
664-
echo=lambda msg: click.echo(msg, err=True),
665-
env=env,
707+
container_names: list[str] = []
708+
for idx in range(int(count)):
709+
if container_name:
710+
name = f"generator_{container_name}_{idx}"
711+
else:
712+
name = f"{_GENERATOR_POD_NAME}_{idx}"
713+
container_names.append(name)
714+
715+
# Create async function to run tasks
716+
async def _create_all_containers() -> list[str | Exception]:
717+
"""Create all containers concurrently using tasks."""
718+
tasks: list[asyncio.Task[str]] = []
719+
for name in container_names:
720+
task = asyncio.create_task(
721+
_create_container(
722+
image_url=image_url,
723+
container_name=name,
724+
targon_api_key=targon_api_key,
725+
resource_name="h200-small",
726+
port=_GENERATOR_PORT,
727+
health_check_path=_GENERATOR_HEALTH_CHECK_PATH,
728+
echo=lambda msg: click.echo(msg, err=True),
729+
env=env,
730+
)
731+
)
732+
tasks.append(task)
733+
734+
# Gather results, allowing some tasks to fail
735+
return await asyncio.gather(*tasks, return_exceptions=True)
736+
737+
# Run the async function
738+
results = asyncio.run(_create_all_containers())
739+
740+
# Separate successful and failed containers
741+
successful_containers: list[tuple[str, str]] = []
742+
failed_containers: list[tuple[str, str]] = []
743+
744+
for name, result in zip(container_names, results):
745+
if isinstance(result, Exception):
746+
error_msg = str(result)
747+
failed_containers.append((name, error_msg))
748+
logger.error(f"Failed to create container {name}: {error_msg}")
749+
else:
750+
successful_containers.append((name, result))
751+
752+
# Display results in tables
753+
_display_containers_table(successful_containers, failed_containers)
754+
755+
# Write successful containers to CSV file
756+
with Path(output_file).open("w", newline="") as f:
757+
writer = csv.writer(f)
758+
writer.writerow(["container_name", "url"])
759+
for name, url in successful_containers:
760+
writer.writerow([name, url])
761+
762+
# Report results
763+
if failed_containers:
764+
click.echo(
765+
json.dumps(
766+
{
767+
"success": len(successful_containers) > 0,
768+
"output_file": output_file,
769+
"created": len(successful_containers),
770+
"failed": len(failed_containers),
771+
"failed_containers": [
772+
{"name": name, "error": error} for name, error in failed_containers
773+
],
774+
}
775+
)
666776
)
667-
)
668-
click.echo(json.dumps({"success": True, "container_url": container_url}))
777+
else:
778+
click.echo(json.dumps({"success": True, "output_file": output_file, "created": len(successful_containers)}))
669779
except KeyboardInterrupt:
670780
logger.warning("Generator start interrupted by user")
671781
click.echo(json.dumps({"success": False, "error": "Interrupted by user"}))
@@ -800,21 +910,51 @@ def judge_cmd(
800910

801911
@cli.command("stop-pods")
802912
@click.option("--targon-api-key", required=True, help="Targon API key.")
803-
def stop_pods_cmd(targon_api_key: str) -> None:
804-
"""Stop the generator, render and judge pods."""
913+
@click.option("--file", "csv_file", default="generators.csv", help="Path to CSV file with container names and URLs (default: generators.csv)")
914+
def stop_pods_cmd(targon_api_key: str, csv_file: str) -> None:
915+
"""Stop generator containers listed in a CSV file, plus render and judge pods."""
805916
click.echo("Stopping pods...", err=True)
917+
918+
# Read container names from CSV file
919+
container_names_from_csv: list[str] = []
920+
try:
921+
with Path(csv_file).open("r", newline="") as f:
922+
reader = csv.DictReader(f)
923+
for row in reader:
924+
container_name = row.get("container_name", "").strip()
925+
if container_name:
926+
container_names_from_csv.append(container_name)
927+
928+
if container_names_from_csv:
929+
click.echo(f"Found {len(container_names_from_csv)} container(s) in {csv_file}", err=True)
930+
except FileNotFoundError:
931+
click.echo(f"CSV file {csv_file} not found, will only stop render and judge pods", err=True)
932+
except Exception as e:
933+
click.echo(f"Failed to read CSV file {csv_file}: {e}, will only stop render and judge pods", err=True)
806934

807935
async def _stop() -> None:
808936
async with TargonClient(api_key=targon_api_key) as targon:
809937
containers = await targon.list_containers()
938+
# Create a mapping of container names to UIDs
939+
name_to_uid: dict[str, str] = {c.name: c.uid for c in containers}
940+
stopped: list[str] = []
941+
not_found: list[str] = []
942+
# Stop containers from CSV file
943+
for container_name in container_names_from_csv:
944+
if container_name in name_to_uid:
945+
click.echo(f"Stopping container {container_name} ({name_to_uid[container_name]})", err=True)
946+
await targon.delete_container(name_to_uid[container_name])
947+
stopped.append(container_name)
948+
else:
949+
click.echo(f"Container {container_name} not found", err=True)
950+
not_found.append(container_name)
951+
# Also stop render and judge pods (legacy behavior)
810952
for c in containers:
811-
# Stop all containers that start with "generator_", plus render and judge pods
812-
if c.name.startswith("generator_") or c.name in [
813-
_RENDER_POD_NAME,
814-
_JUDGE_POD_NAME,
815-
]:
816-
click.echo(f"Stopping container {c.name} ({c.uid})", err=True)
817-
await targon.delete_container(c.uid)
953+
if c.name in [_RENDER_POD_NAME, _JUDGE_POD_NAME]:
954+
if c.name not in stopped: # Avoid stopping twice if it was in CSV
955+
click.echo(f"Stopping container {c.name} ({c.uid})", err=True)
956+
await targon.delete_container(c.uid)
957+
stopped.append(c.name)
818958

819959
try:
820960
asyncio.run(_stop())
@@ -926,21 +1066,21 @@ async def _create_container(
9261066
echo=echo,
9271067
)
9281068
if container:
929-
echo(f"Container deployed successfully. UID: {container.uid}")
930-
echo(f"Container URL: {container.url}")
1069+
echo(f"{container_name}: Container deployed successfully. UID: {container.uid}")
1070+
echo(f"{container_name}: Container URL: {container.url}")
9311071
url: str = str(container.url)
9321072
return url
9331073
else:
934-
raise RuntimeError("Failed to deploy and start container")
1074+
raise RuntimeError(f"{container_name}: Failed to deploy and start container")
9351075
except (KeyboardInterrupt, asyncio.CancelledError):
936-
echo("\nInterrupted by user. Cleaning up...")
1076+
echo(f"{container_name}: \nInterrupted by user. Cleaning up...")
9371077
if container:
9381078
try:
9391079
async with TargonClient(api_key=targon_api_key) as targon:
9401080
await targon.delete_container(container.uid)
941-
echo("Container deleted successfully")
1081+
echo(f"{container_name}: Container deleted successfully")
9421082
except Exception as cleanup_error:
943-
echo(f"Error during cleanup: {cleanup_error}")
1083+
echo(f"{container_name}: Error during cleanup: {cleanup_error}")
9441084
raise
9451085

9461086

targon_utils.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ async def wait_for_healthy(
5353
timeout: float, # noqa: ASYNC109
5454
check_interval: float = 5.0,
5555
health_check_path: str = "/health",
56+
name: str | None = None,
5657
echo: Callable[[str], None] | None = None,
5758
) -> bool:
5859
"""Wait for the container health endpoint to return 200 (stage 2)."""
@@ -62,6 +63,7 @@ async def wait_for_healthy(
6263
else:
6364
health_url = f"{url}{health_check_path}"
6465

66+
container_label = f"{name}: " if name else ""
6567
async with httpx.AsyncClient(timeout=30.0) as http:
6668
start = asyncio.get_running_loop().time()
6769
time_elapsed = asyncio.get_running_loop().time() - start
@@ -70,13 +72,13 @@ async def wait_for_healthy(
7072
response = await http.get(health_url)
7173
response.raise_for_status()
7274
if response.status_code == 200:
73-
_log(f"Container at {url} healthy", echo, "info")
75+
_log(f"{container_label}Container at {url} healthy", echo, "info")
7476
return True
7577
except Exception:
76-
_log(f"Container not ready yet: {time_elapsed:.1f}/{timeout:.1f}s", echo, "info")
78+
_log(f"{container_label}Container not ready yet: {time_elapsed:.1f}/{timeout:.1f}s", echo, "info")
7779
await asyncio.sleep(check_interval)
7880
time_elapsed = asyncio.get_running_loop().time() - start
79-
_log(f"Container at {url} not healthy within {timeout}s. Timeout reached.", echo, "error")
81+
_log(f"{container_label}Container at {url} not healthy within {timeout}s. Timeout reached.", echo, "error")
8082
return False
8183

8284

@@ -132,6 +134,7 @@ async def ensure_running_container(
132134
timeout=warmup_timeout,
133135
check_interval=check_interval,
134136
health_check_path=health_check_path,
137+
name=name,
135138
echo=echo,
136139
):
137140
_log("Container failed health check, deleting", echo, "error")

0 commit comments

Comments
 (0)