Skip to content

Commit b30def4

Browse files
authored
Jwilber/lepton use shared secrets (#1450)
I recently learned that Lepton supports team-shared variables (secrets): [link](https://docs.nvidia.com/dgx-cloud/lepton/features/workspace/secret/), so I created new shared variables. This allows anyone on our team to run the workflow locally without depending on my personal variables.

I also added:
- a debug config (`recipes/debug`)
- flags to skip kratos/wandb
- logic for single-GPU resource shapes

To run locally:
```
python ci/lepton/core/launch_job.py --config-path="../model_convergence/configs" --config-name="recipes/debug"
```

---------

Signed-off-by: jwilber <jwilber@nvidia.com>
1 parent f01a9d2 commit b30def4

4 files changed

Lines changed: 161 additions & 16 deletions

File tree

ci/lepton/core/launch_job.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,20 +83,35 @@ def launch_single_job(client, cfg: DictConfig):
8383
full_cfg_json = json.dumps(OmegaConf.to_container(cfg, resolve=True))
8484
template_type = getattr(cfg, "template_type", "convergence_tests")
8585

86-
# trigger with logging to kratos, if enabled. Default to True.
87-
if getattr(cfg, "log_to_kratos", True):
88-
print("Logging to kratos")
86+
# Telemetry flags
87+
log_to_wandb = getattr(cfg, "log_to_wandb", True)
88+
log_to_kratos = getattr(cfg, "log_to_kratos", True)
89+
90+
print(f"Logging to wandb: {log_to_wandb}")
91+
print(f"Logging to kratos: {log_to_kratos}")
92+
93+
# Render script with kratos telemetry wrapper if enabled
94+
if log_to_kratos:
8995
rendered = render_launcher_string(cfg.script, full_cfg_json, template=template_type)
90-
else: # don't log to kratos
91-
print("Not logging to kratos")
96+
else:
9297
rendered = cfg.script
9398

9499
command = ["bash", "-c", rendered]
95100

96101
# env vars
97102
env_vars = []
103+
104+
# Env var names to skip based on telemetry flags
105+
kratos_env_vars = {"KRATOS_SSA_URL", "KRATOS_SSA_CLIENT_ID", "KRATOS_SSA_SECRET"}
106+
98107
if getattr(cfg, "environment_variables", None):
99108
for env_var in cfg.environment_variables:
109+
# Skip W&B env var if log_to_wandb is disabled
110+
if not log_to_wandb and env_var.name == "WANDB_API_KEY":
111+
continue
112+
# Skip Kratos env vars if log_to_kratos is disabled
113+
if not log_to_kratos and env_var.name in kratos_env_vars:
114+
continue
100115
env_vars.append(construct_env_var(env_var))
101116

102117
# mounts

ci/lepton/core/lepton_utils.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,18 @@ def validate_resource_shape(node_group: str, resource_shape: str) -> None:
6969
known_groups = ", ".join(sorted(resource_shapes_by_node_group.keys()))
7070
raise SystemExit(f"Unknown node group '{node_group}'.\nKnown node groups: {known_groups}")
7171

72-
# Extract GPU type from resource shape (e.g., "gpu.2xh100-sxm" -> "h100-sxm")
72+
# Extract GPU type from resource shape
73+
# Formats: "gpu.h100-sxm" (single), "gpu.2xh100-sxm" (multi)
7374
try:
74-
# Handle format like "gpu.2xh100-sxm" or "gpu.8xh200"
75-
gpu_part = resource_shape.split(".", 1)[1] # Get "2xh100-sxm"
76-
gpu_type = gpu_part.split("x", 1)[1] # Get "h100-sxm"
75+
gpu_part = resource_shape.split(".", 1)[1] # Get "h100-sxm" or "2xh100-sxm"
76+
# Check if format is "NxGPU_TYPE" (multi-GPU) or just "GPU_TYPE" (single GPU)
77+
if gpu_part[0].isdigit() and "x" in gpu_part:
78+
gpu_type = gpu_part.split("x", 1)[1] # Get "h100-sxm" from "2xh100-sxm"
79+
else:
80+
gpu_type = gpu_part # Single GPU: "h100-sxm"
7781
except (IndexError, ValueError):
7882
raise SystemExit(
79-
f"Invalid resource shape format: {resource_shape}. Expected format: gpu.NxGPU_TYPE or cpu.SIZE"
83+
f"Invalid resource shape format: {resource_shape}. Expected format: gpu.GPU_TYPE or gpu.NxGPU_TYPE"
8084
)
8185

8286
available_gpu_types = resource_shapes_by_node_group[node_group]

ci/lepton/model_convergence/configs/base.yaml

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,22 +19,38 @@ container:
1919
# These keys must be present for the job to authenticate with
2020
# external services (W&B, Kratos, Lepton) and control runtime caching.
2121
# HF_HOME is optional but recommended to speed up Hugging Face model loading.
22+
#
23+
# For local development, you can disable telemetry as shown below; secret names can also be overridden the same way (e.g. wandb_secret=MY_SECRET_NAME):
24+
# python ci/lepton/core/launch_job.py \
25+
# --config-path="../model_convergence/configs" \
26+
# --config-name="recipes/esm2_native_te" \
27+
# log_to_wandb=false \
28+
# log_to_kratos=false
2229
############################################################
30+
31+
# Telemetry flags (set to false to skip W&B/Kratos and avoid needing their secrets)
32+
log_to_wandb: true
33+
log_to_kratos: true
34+
35+
# Secret name overrides (users can point to their own Lepton secrets)
36+
wandb_secret: SHARED_WANDB_API_KEY # pragma: allowlist secret
37+
hf_token_secret: SHARED_HF_TOKEN # pragma: allowlist secret
38+
2339
environment_variables:
2440
- name: WANDB_API_KEY
25-
value_from: JWILBER_WANDB_API_KEY
41+
value_from: ${wandb_secret}
2642
- name: KRATOS_SSA_URL
27-
value_from: KRATOS_SSA_URL
43+
value_from: SHARED_KRATOS_SSA_URL
2844
- name: KRATOS_SSA_CLIENT_ID
29-
value_from: KRATOS_SSA_CLIENT_ID
45+
value_from: SHARED_KRATOS_SSA_CLIENT_ID
3046
- name: KRATOS_SSA_SECRET
31-
value_from: KRATOS_SSA_SECRET.jwilber
47+
value_from: SHARED_KRATOS_SSA_SECRET # pragma: allowlist secret
3248
- name: LEP_LOGIN_CREDENTIALS
33-
value_from: LEP_LOGIN_CREDENTIALS
49+
value_from: SHARED_LEP_LOGIN_CREDENTIALS
3450
- name: HF_HOME
3551
value: /data/esm2/cache
3652
- name: HF_TOKEN
37-
value_from: HUGGING_FACE_HUB_TOKEN.jwilber
53+
value_from: ${hf_token_secret}
3854

3955
############################################################
4056
# Lepton Cluster Selection & Node Group
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# @package _global_
2+
# Debug config for testing local development setup.
3+
# Usage:
4+
# python ci/lepton/core/launch_job.py \
5+
# --config-path="../model_convergence/configs" \
6+
# --config-name="recipes/debug" \
7+
# hf_token_secret=YOUR_HF_TOKEN_SECRET_NAME
8+
defaults:
9+
- /base
10+
- _self_
11+
12+
############################################################
13+
# Disable telemetry by default for local testing
14+
############################################################
15+
log_to_wandb: false
16+
log_to_kratos: false
17+
18+
############################################################
19+
# lepton job info - minimal single GPU setup
20+
############################################################
21+
node_group: yo-bom-lepton-001
22+
mount_from: node-nfs:fs1
23+
num_nodes: 1
24+
device_type: gpu
25+
num_devices: 1
26+
gpu_type: h100-sxm
27+
resource_shape: gpu.h100-sxm # Single GPU: gpu.h100-sxm, Multi: gpu.{2,4,8}xh100-sxm
28+
29+
############################################################
30+
# recipe identifiers
31+
############################################################
32+
recipe_subdir: esm2_native_te
33+
model_type: esm2
34+
variant: train
35+
36+
framework: native
37+
precision: bf16
38+
te_enabled: true
39+
fp8_enabled: false
40+
extras: []
41+
42+
############################################################
43+
# wandb info (logging is disabled by default; these settings apply when it is enabled)
44+
############################################################
45+
total_gpus: ${multiply:${num_devices},${num_nodes}}
46+
47+
wandb_init_args:
48+
project: "debug__${sanitize:${branch}}"
49+
group: "debug"
50+
job_type: "${recipe_subdir}"
51+
name: null
52+
mode: "offline"
53+
54+
############################################################
55+
# task commands - minimal training run
56+
############################################################
57+
model_tag: nvidia/esm2_t33_650M_UR50D
58+
task_cmd: train_fsdp2
59+
config: L1_650M
60+
num_train_steps: 10
61+
micro_batch_size: 4
62+
load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data
63+
load_dataset_kwargs_streaming: true
64+
load_dataset_kwargs_revision: 4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret
65+
num_workers: 1
66+
67+
num_warmup_steps: 2
68+
ckpt_dir: ""
69+
save_checkpoints: false
70+
save_final_model: false
71+
resume_from_checkpoint: false
72+
use_distributed_checkpoint_fsdp2: false
73+
74+
parallelism_strategy: fsdp2
75+
thd_enabled: false
76+
77+
############################################################
78+
# job name
79+
############################################################
80+
job_name: "debug-esm2-650m"
81+
wandb_name: "debug__${now:%Y%m%d-%H%M%S}"
82+
83+
############################################################
84+
# run script
85+
############################################################
86+
run_script: |
87+
HYDRA_FULL_ERROR=1 torchrun \
88+
--standalone \
89+
--nproc_per_node=1 \
90+
${task_cmd}.py \
91+
--config-name ${config}.yaml \
92+
wandb_init_args.mode=${wandb_init_args.mode} \
93+
wandb_init_args.project=${wandb_init_args.project} \
94+
+wandb_init_args.group=${wandb_init_args.group} \
95+
+wandb_init_args.job_type=${wandb_init_args.job_type} \
96+
wandb_init_args.name=${wandb_name} \
97+
num_train_steps=${num_train_steps} \
98+
dataset.micro_batch_size=${micro_batch_size} \
99+
use_sequence_packing=${thd_enabled} \
100+
dataset.load_dataset_kwargs.path=${load_dataset_kwargs_path} \
101+
dataset.load_dataset_kwargs.streaming=${load_dataset_kwargs_streaming} \
102+
+dataset.load_dataset_kwargs.revision=${load_dataset_kwargs_revision} \
103+
dataset.num_workers=${num_workers} \
104+
lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \
105+
checkpoint.ckpt_dir=${ckpt_dir} \
106+
checkpoint.save_final_model=${save_final_model} \
107+
checkpoint.resume_from_checkpoint=${resume_from_checkpoint} \
108+
+checkpoint.save_checkpoints=${save_checkpoints} \
109+
+checkpoint.use_distributed_checkpoint_fsdp2=${use_distributed_checkpoint_fsdp2} \
110+
fp8_config.enabled=${fp8_enabled}

0 commit comments

Comments
 (0)