Skip to content

Commit 2fe14da

Browse files
committed
prime train logs: expose -c trainer / -c inference / -c env-server
The backend's /api/v1/rft/runs/{run_id}/logs endpoint now accepts `component` and `env_name` params (dedicated full-FT). Surface them through the CLI:

    prime train logs <run_id> -c trainer
    prime train logs <run_id> -c inference
    prime train logs <run_id> -c env-server --env <name>

Legacy `--env <name>/<idx>` still routes through the env-server-logs endpoint (shared-RFT pods, cluster_id-backed lookup). A dedicated env-server (slug, no slash) goes through the unified /logs route.

Per-rank `--pod-index` is intentionally not exposed yet: the chart's `torchrun --local-ranks-filter=0` already collapses in-pod rank fan-out to rank 0 stdout, and Loki's pod-label indexing in this tenant doesn't actually filter the prime-job-* streams, so per-pod inspection on multi-node runs still requires kubectl plus the PVC log files for now.
1 parent 7c43395 commit 2fe14da

2 files changed

Lines changed: 71 additions & 22 deletions

File tree

packages/prime/src/prime_cli/api/rl.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -384,12 +384,31 @@ def get_run(self, run_id: str) -> RLRun:
384384
raise APIError(f"Failed to get RL run: {e.response.text}")
385385
raise APIError(f"Failed to get RL run: {str(e)}")
386386

387-
def get_logs(self, run_id: str, tail_lines: int = 1000) -> str:
388-
"""Get orchestrator logs for an RL run."""
387+
def get_logs(
388+
self,
389+
run_id: str,
390+
tail_lines: int = 1000,
391+
component: Optional[str] = None,
392+
pod_index: int = 0,
393+
env_name: Optional[str] = None,
394+
) -> str:
395+
"""Get logs for one component of an RFT run.
396+
397+
Defaults to the orchestrator pod. Dedicated full-FT runs additionally
398+
expose `trainer`, `inference`, and `env-server` components.
399+
`pod_index` narrows to a specific replica for multi-node
400+
trainer/inference; `env_name` picks among per-env env-server
401+
StatefulSets when `component='env-server'`.
402+
"""
403+
params: Dict[str, Any] = {"tail_lines": tail_lines}
404+
if component:
405+
params["component"] = component
406+
if pod_index:
407+
params["pod_index"] = pod_index
408+
if env_name:
409+
params["env_name"] = env_name
389410
try:
390-
response = self.client.get(
391-
f"/rft/runs/{run_id}/logs", params={"tail_lines": tail_lines}
392-
)
411+
response = self.client.get(f"/rft/runs/{run_id}/logs", params=params)
393412
return response.get("logs", "")
394413
except Exception as e:
395414
if hasattr(e, "response") and hasattr(e.response, "text"):

packages/prime/src/prime_cli/commands/rl.py

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1856,8 +1856,9 @@ def get_logs(
18561856
"--component",
18571857
"-c",
18581858
help=(
1859-
"Pod to read logs from: 'orchestrator' (default) or 'env-server'. "
1860-
"Inferred from --env when omitted."
1859+
"Pod to read logs from: 'orchestrator' (default), 'trainer', "
1860+
"'inference', or 'env-server'. trainer/inference apply only "
1861+
"to dedicated full-FT runs. Inferred from --env when omitted."
18611862
),
18621863
),
18631864
env: Optional[str] = typer.Option(
@@ -1875,30 +1876,38 @@ def get_logs(
18751876
) -> None:
18761877
"""Get logs for a run.
18771878
1878-
Defaults to the orchestrator pod. Pass ``--env <name>`` to read an
1879-
env-server pod instead — useful when an env-server is crash-looping
1880-
(e.g. ``ModuleNotFoundError``) and the orchestrator has stalled at
1881-
"Starting orchestrator step 0".
1879+
Defaults to the orchestrator pod. Use ``--component`` to pick one of
1880+
``trainer`` / ``inference`` / ``env-server`` (dedicated full-FT only).
1881+
Pass ``--env <name>`` to read an env-server pod by name (shorthand for
1882+
``--component=env-server``).
18821883
18831884
List available pods first with ``prime train components <run_id>``.
18841885
1886+
Per-rank narrowing on multi-replica trainer/inference is not yet
1887+
surfaced here — `--local-ranks-filter=0` in the chart's torchrun
1888+
invocation already dedupes the in-pod rank fan-out, and per-pod
1889+
inspection on multi-node runs requires kubectl + the PVC log files.
1890+
18851891
Examples:
18861892
18871893
prime train logs <run_id>
18881894
prime train logs <run_id> -f
1895+
prime train logs <run_id> -c trainer
1896+
prime train logs <run_id> -c inference
18891897
prime train logs <run_id> --env reverse-text
18901898
prime train logs <run_id> --env reverse-text/1 -f
18911899
"""
1900+
valid_components = ("orchestrator", "trainer", "inference", "env-server")
18921901
if component is None:
18931902
component = "env-server" if env is not None else "orchestrator"
1894-
elif component not in ("orchestrator", "env-server"):
1903+
elif component not in valid_components:
18951904
raise typer.BadParameter(
1896-
f"Invalid component '{component}'. Use 'orchestrator' or 'env-server'.",
1905+
f"Invalid component '{component}'. Use one of: {', '.join(valid_components)}.",
18971906
param_hint="--component",
18981907
)
1899-
if component == "orchestrator" and env is not None:
1908+
if env is not None and component != "env-server":
19001909
raise typer.BadParameter(
1901-
"--env applies only to env-server logs. Drop --component=orchestrator or drop --env.",
1910+
f"--env applies only to env-server logs. Drop --component={component} or drop --env.",
19021911
param_hint="--env",
19031912
)
19041913
if component == "env-server" and env is None:
@@ -1912,25 +1921,46 @@ def get_logs(
19121921
api_client = APIClient()
19131922
rl_client = RLClient(api_client)
19141923

1915-
if component == "orchestrator":
1924+
if component == "env-server" and env is not None and "/" in env:
1925+
# Legacy shared-RFT env-server (`name/index` qualifier) — go
1926+
# through the dedicated env-server endpoint which uses the
1927+
# cluster_id-backed pod lookup path. Dedicated full-FT
1928+
# env-servers use the unified /logs route with
1929+
# component=env-server + env_name (StatefulSets always run
1930+
# one pod per env, so no index disambiguation needed).
1931+
env_name_q, env_index_q = _parse_env_qualifier(env)
1932+
1933+
def fetch(t: int) -> str:
1934+
return rl_client.get_env_server_logs(
1935+
run_id,
1936+
env_name=env_name_q,
1937+
env_index=env_index_q,
1938+
tail_lines=t,
1939+
)
1940+
1941+
label = f"env-server {env}"
1942+
elif component == "orchestrator":
19161943

19171944
def fetch(t: int) -> str:
19181945
return rl_client.get_logs(run_id, tail_lines=t)
19191946

19201947
label = "orchestrator"
19211948
else:
1922-
assert env is not None # narrowed by validation above
1923-
env_name, env_index = _parse_env_qualifier(env)
1949+
# trainer / inference / dedicated env-server — unified /logs
1950+
# route. env (no slash) names the dedicated env-server's
1951+
# StatefulSet.
1952+
fetch_component = component
1953+
fetch_env = env if component == "env-server" else None
19241954

19251955
def fetch(t: int) -> str:
1926-
return rl_client.get_env_server_logs(
1956+
return rl_client.get_logs(
19271957
run_id,
1928-
env_name=env_name,
1929-
env_index=env_index,
19301958
tail_lines=t,
1959+
component=fetch_component,
1960+
env_name=fetch_env,
19311961
)
19321962

1933-
label = f"env-server {env}"
1963+
label = f"env-server {env}" if component == "env-server" else component
19341964

19351965
_stream_logs(
19361966
fetch_fn=fetch,

0 commit comments

Comments
 (0)