prime train logs: expose -c trainer / -c inference / -c env-server

JannikSt · JannikSt · commit feea5701bd80 · 2026-06-01T15:02:18.000+02:00
Backend's /api/v1/rft/runs/{run_id}/logs now accepts component +
env_name params (dedicated full-FT). Surface them through the CLI:

  prime train logs <run_id> -c trainer
  prime train logs <run_id> -c inference
  prime train logs <run_id> -c env-server --env <name>

Legacy --env <name>/<idx> still routes through the env-server-logs
endpoint (shared-RFT pods, cluster_id-backed lookup). A dedicated
env-server --env value (slug, no slash) goes through the unified /logs route.

Per-rank --pod-index intentionally not exposed yet: the chart's
torchrun --local-ranks-filter=0 already collapses in-pod rank fan-out
to rank 0 stdout, and Loki's pod-label indexing in this tenant
doesn't actually filter the prime-job-* streams — per-pod inspection
on multi-node runs is kubectl + the PVC log files for now.
diff --git a/packages/prime/src/prime_cli/api/rl.py b/packages/prime/src/prime_cli/api/rl.py
@@ -74,18 +74,18 @@ class RLRun(BaseModel):
     kind: Optional[str] = Field(None, description="Run kind discriminator")
 
     # Training configuration
-    rollouts_per_example: int = Field(..., alias="rolloutsPerExample")
-    seq_len: int = Field(..., alias="seqLen")
-    max_steps: int = Field(..., alias="maxSteps")
+    rollouts_per_example: Optional[int] = Field(None, alias="rolloutsPerExample")
+    seq_len: Optional[int] = Field(None, alias="seqLen")
+    max_steps: Optional[int] = Field(None, alias="maxSteps")
     max_tokens: Optional[int] = Field(None, alias="maxTokens")
-    batch_size: int = Field(..., alias="batchSize")
+    batch_size: Optional[int] = Field(None, alias="batchSize")
     loss: Optional[str] = "rl"
     teacher: Optional[Dict[str, Any]] = Field(
         None,
         validation_alias=AliasChoices("teacher", "teacherConfig"),
         serialization_alias="teacher",
     )
-    base_model: str = Field(..., alias="baseModel")
+    base_model: Optional[str] = Field(None, alias="baseModel")
     environments: List[Dict[str, Any]] = Field(default_factory=list)
     run_config: Optional[Dict[str, Any]] = Field(None, alias="runConfig")
     eval_config: Optional[Dict[str, Any]] = Field(None, alias="evalConfig")
@@ -423,16 +423,25 @@ def get_logs(
         regex: bool = False,
         level: Optional[str] = None,
         since_seconds: Optional[int] = None,
+        component: Optional[str] = None,
+        pod_index: int = 0,
+        env_name: Optional[str] = None,
     ) -> str:
-        """Get orchestrator logs for a Hosted Training run.
+        """Get logs for one component of a Hosted Training run.
+
+        Defaults to the orchestrator pod. Dedicated full-FT runs additionally
+        expose `trainer`, `inference`, and `env-server` components.
+        `pod_index` narrows to a specific replica for multi-node
+        trainer/inference; `env_name` picks among per-env env-server
+        StatefulSets when `component='env-server'`.
 
         Optional filters narrow the result via the platform's log search
         backend:
           - search: substring (or regex if regex=True) line filter
           - level:  one of ERROR/WARNING/SUCCESS/INFO/DEBUG
-          - since_seconds: how far back to look (60–86400)
+          - since_seconds: how far back to look (60-86400)
         """
-        params: Dict[str, object] = {"tail_lines": tail_lines}
+        params: Dict[str, Any] = {"tail_lines": tail_lines}
         if search:
             params["search"] = search
         if regex:
@@ -441,6 +450,12 @@ def get_logs(
             params["level"] = level
         if since_seconds is not None:
             params["since_seconds"] = since_seconds
+        if component:
+            params["component"] = component
+        if pod_index:
+            params["pod_index"] = pod_index
+        if env_name:
+            params["env_name"] = env_name
         try:
             response = self.client.get(f"/rft/runs/{run_id}/logs", params=params)
             return response.get("logs", "")
diff --git a/packages/prime/src/prime_cli/commands/rl.py b/packages/prime/src/prime_cli/commands/rl.py
@@ -916,10 +916,10 @@ def _format_run_for_display(run: RLRun) -> Dict[str, Any]:
     return {
         "id": run.id,
         "status": run.status,
-        "model": run.base_model,
+        "model": run.base_model or "-",
         "environments": envs_display,
-        "steps": f"{run.max_steps}",
-        "rollouts": str(run.rollouts_per_example),
+        "steps": "-" if run.max_steps is None else f"{run.max_steps}",
+        "rollouts": "-" if run.rollouts_per_example is None else str(run.rollouts_per_example),
         "created_at": created_at,
         "team_id": run.team_id,
     }
@@ -1776,11 +1776,12 @@ def get_run(
         if run.status == "QUEUED" and run.runs_ahead is not None:
             status_text += f" (~{run.runs_ahead} runs ahead)"
         console.print(f"  Status: [{status_color}]{status_text}[/{status_color}]")
-        console.print(f"  Model: [magenta]{run.base_model}[/magenta]")
+        console.print(f"  Model: [magenta]{formatted['model']}[/magenta]")
         console.print(f"  Environments: [green]{formatted['environments']}[/green]")
-        console.print(f"  Max Steps: {run.max_steps}")
-        console.print(f"  Batch Size: {run.batch_size}")
-        console.print(f"  Rollouts per Example: {run.rollouts_per_example}")
+        console.print(f"  Max Steps: {formatted['steps']}")
+        batch_size = "-" if run.batch_size is None else str(run.batch_size)
+        console.print(f"  Batch Size: {batch_size}")
+        console.print(f"  Rollouts per Example: {formatted['rollouts']}")
         if run.max_tokens:
             console.print(f"  Max Tokens: {run.max_tokens}")
         if run.wandb_project:
@@ -1846,12 +1847,9 @@ def delete_run(
     # Try the hosted full-FT delete endpoint first. The backend's kind
     # gate 404s for non-DEDICATED_FULL_FT runs, so a 404 here means
     # "not a hosted run" and we fall back to the LoRA-shared path.
-    # This avoids the prior approach of pre-fetching via rl_client.get_run
-    # for the discriminator — which fails for DEDICATED_FULL_FT runs
-    # whose row doesn't carry the LoRA-required RLRun fields
-    # (rollouts_per_example, seq_len, max_steps, batch_size, base_model).
-    # Pydantic ValidationError on those would mask the actual run kind
-    # and silently route to the wrong endpoint.
+    # This avoids relying on list/get discriminator shape before delete:
+    # the delete endpoint owns the run-kind decision, and the CLI only
+    # falls back when that endpoint says the row is not dedicated full-FT.
     from ..api.training import HostedTrainingClient
 
     rl_client = RLClient(api_client)
@@ -2078,6 +2076,14 @@ def _parse_env_qualifier(env: str) -> tuple[str, int]:
     return env, 0
 
 
+def _parse_env_qualifier_with_index(env: str) -> tuple[str, int, bool]:
+    """Parse an env qualifier and report whether a numeric suffix was present."""
+    name, sep, idx_str = env.rpartition("/")
+    if sep and name and idx_str.isdigit():
+        return name, int(idx_str), True
+    return env, 0, False
+
+
 @app.command("logs", rich_help_panel="Monitoring")
 def get_logs(
     run_id: str = typer.Argument(..., help="Run ID to get logs for"),
@@ -2086,8 +2092,9 @@ def get_logs(
         "--component",
         "-c",
         help=(
-            "Pod to read logs from: 'orchestrator' (default) or 'env-server'. "
-            "Inferred from --env when omitted."
+            "Pod to read logs from: 'orchestrator' (default), 'trainer', "
+            "'inference', or 'env-server'. trainer/inference apply only "
+            "to dedicated full-FT runs. Inferred from --env when omitted."
         ),
     ),
     env: Optional[str] = typer.Option(
@@ -2132,33 +2139,41 @@ def get_logs(
 ) -> None:
     """Get logs for a run.
 
-    Defaults to the orchestrator pod. Pass ``--env <name>`` to read an
-    env-server pod instead — useful when an env-server is crash-looping
-    (e.g. ``ModuleNotFoundError``) and the orchestrator has stalled at
-    "Starting orchestrator step 0".
+    Defaults to the orchestrator pod. Use ``--component`` to pick one of
+    ``trainer`` / ``inference`` / ``env-server`` (dedicated full-FT only).
+    Pass ``--env <name>`` to read an env-server pod by name (shorthand for
+    ``--component=env-server``).
 
     List available pods first with ``prime train components <run_id>``.
 
+    Per-rank narrowing on multi-replica trainer/inference is not yet
+    surfaced here — `--local-ranks-filter=0` in the chart's torchrun
+    invocation already dedupes the in-pod rank fan-out, and per-pod
+    inspection on multi-node runs requires kubectl + the PVC log files.
+
     Examples:
 
         prime train logs <run_id>
         prime train logs <run_id> -f
         prime train logs <run_id> --search Backpressure
         prime train logs <run_id> --level ERROR --since 1h
         prime train logs <run_id> --search 'Step \\d+' --regex
+        prime train logs <run_id> -c trainer
+        prime train logs <run_id> -c inference
         prime train logs <run_id> --env reverse-text
         prime train logs <run_id> --env reverse-text/1 -f
     """
+    valid_components = ("orchestrator", "trainer", "inference", "env-server")
     if component is None:
         component = "env-server" if env is not None else "orchestrator"
-    elif component not in ("orchestrator", "env-server"):
+    elif component not in valid_components:
         raise typer.BadParameter(
-            f"Invalid component '{component}'. Use 'orchestrator' or 'env-server'.",
+            f"Invalid component '{component}'. Use one of: {', '.join(valid_components)}.",
             param_hint="--component",
         )
-    if component == "orchestrator" and env is not None:
+    if env is not None and component != "env-server":
         raise typer.BadParameter(
-            "--env applies only to env-server logs. Drop --component=orchestrator or drop --env.",
+            f"--env applies only to env-server logs. Drop --component={component} or drop --env.",
             param_hint="--env",
         )
     if component == "env-server" and env is None:
@@ -2189,7 +2204,33 @@ def get_logs(
         api_client = APIClient()
         rl_client = RLClient(api_client)
 
-        if component == "orchestrator":
+        env_name_q, env_index_q, env_has_index_q = (
+            _parse_env_qualifier_with_index(env) if env is not None else (None, 0, False)
+        )
+
+        if component == "env-server" and env is not None and env_has_index_q:
+            assert env_name_q is not None
+            # Legacy shared-RFT env-server (`name/index` qualifier) — go
+            # through the dedicated env-server endpoint which uses the
+            # cluster_id-backed pod lookup path. Dedicated full-FT
+            # env-servers use the unified /logs route with
+            # component=env-server + env_name (StatefulSets always run
+            # one pod per env, so no index disambiguation needed).
+
+            def fetch(t: int) -> str:
+                return rl_client.get_env_server_logs(
+                    run_id,
+                    env_name=env_name_q,
+                    env_index=env_index_q,
+                    tail_lines=t,
+                    search=search,
+                    regex=regex,
+                    level=normalized_level,
+                    since_seconds=since_seconds,
+                )
+
+            label = f"env-server {env}"
+        elif component == "orchestrator":
 
             def fetch(t: int) -> str:
                 return rl_client.get_logs(
@@ -2203,22 +2244,25 @@ def fetch(t: int) -> str:
 
             label = "orchestrator"
         else:
-            assert env is not None  # narrowed by validation above
-            env_name, env_index = _parse_env_qualifier(env)
+            # trainer / inference / dedicated env-server — unified /logs
+            # route. env (no slash) names the dedicated env-server's
+            # StatefulSet.
+            fetch_component = component
+            fetch_env = env if component == "env-server" else None
 
             def fetch(t: int) -> str:
-                return rl_client.get_env_server_logs(
+                return rl_client.get_logs(
                     run_id,
-                    env_name=env_name,
-                    env_index=env_index,
                     tail_lines=t,
                     search=search,
                     regex=regex,
                     level=normalized_level,
                     since_seconds=since_seconds,
+                    component=fetch_component,
+                    env_name=fetch_env,
                 )
 
-            label = f"env-server {env}"
+            label = f"env-server {env}" if component == "env-server" else component
 
         _stream_logs(
             fetch_fn=fetch,
diff --git a/packages/prime/tests/test_rl_api.py b/packages/prime/tests/test_rl_api.py
@@ -2,7 +2,7 @@
 
 from typing import Any
 
-from prime_cli.api.rl import RLClient
+from prime_cli.api.rl import RLClient, RLRun
 
 
 class FakeAPIClient:
@@ -46,6 +46,27 @@ def post(self, endpoint: str, json: dict[str, Any] | None = None) -> dict[str, A
         }
 
 
+def test_run_model_allows_dedicated_full_ft_without_lora_fields() -> None:
+    run = RLRun.model_validate(
+        {
+            "id": "full-ft-run",
+            "name": "dedicated",
+            "userId": "user-1",
+            "status": "RUNNING",
+            "kind": "DEDICATED_FULL_FT",
+            "createdAt": "2026-05-17T00:00:00Z",
+            "updatedAt": "2026-05-17T00:00:00Z",
+        }
+    )
+
+    assert run.kind == "DEDICATED_FULL_FT"
+    assert run.rollouts_per_example is None
+    assert run.seq_len is None
+    assert run.max_steps is None
+    assert run.batch_size is None
+    assert run.base_model is None
+
+
 def test_get_distributions_preserves_chart_histogram_data() -> None:
     api_client = FakeAPIClient()
     client = RLClient(api_client)  # type: ignore[arg-type]
diff --git a/packages/prime/tests/test_rl_logs.py b/packages/prime/tests/test_rl_logs.py