Jwilber/lepton use shared secrets (#1450)

jwilber · web-flow · commit b30def4696d5 · 2026-02-05T20:32:47.000Z
I recently learned you can use team-shared variables with Lepton: [link](https://docs.nvidia.com/dgx-cloud/lepton/features/workspace/secret/), so I created new shared variables. This allows anyone on our team to run the workflow locally without depending on my personal variables.

I also added:
- a debug script
- flags to skip kratos/wandb
- logic for single gpu resource

To run locally:
```
python ci/lepton/core/launch_job.py --config-path="../model_convergence/configs" --config-name="recipes/debug"
```

---------

Signed-off-by: jwilber <jwilber@nvidia.com>
diff --git a/ci/lepton/core/launch_job.py b/ci/lepton/core/launch_job.py
@@ -83,20 +83,35 @@ def launch_single_job(client, cfg: DictConfig):
     full_cfg_json = json.dumps(OmegaConf.to_container(cfg, resolve=True))
     template_type = getattr(cfg, "template_type", "convergence_tests")
 
-    # trigger with logging to kratos, if enabled. Default to True.
-    if getattr(cfg, "log_to_kratos", True):
-        print("Logging to kratos")
+    # Telemetry flags
+    log_to_wandb = getattr(cfg, "log_to_wandb", True)
+    log_to_kratos = getattr(cfg, "log_to_kratos", True)
+
+    print(f"Logging to wandb: {log_to_wandb}")
+    print(f"Logging to kratos: {log_to_kratos}")
+
+    # Render script with kratos telemetry wrapper if enabled
+    if log_to_kratos:
         rendered = render_launcher_string(cfg.script, full_cfg_json, template=template_type)
-    else:  # don't log to kratos
-        print("Not logging to kratos")
+    else:
         rendered = cfg.script
 
     command = ["bash", "-c", rendered]
 
     # env vars
     env_vars = []
+
+    # Env var names to skip based on telemetry flags
+    kratos_env_vars = {"KRATOS_SSA_URL", "KRATOS_SSA_CLIENT_ID", "KRATOS_SSA_SECRET"}
+
     if getattr(cfg, "environment_variables", None):
         for env_var in cfg.environment_variables:
+            # Skip W&B env var if log_to_wandb is disabled
+            if not log_to_wandb and env_var.name == "WANDB_API_KEY":
+                continue
+            # Skip Kratos env vars if log_to_kratos is disabled
+            if not log_to_kratos and env_var.name in kratos_env_vars:
+                continue
             env_vars.append(construct_env_var(env_var))
 
     # mounts
diff --git a/ci/lepton/core/lepton_utils.py b/ci/lepton/core/lepton_utils.py
@@ -69,14 +69,18 @@ def validate_resource_shape(node_group: str, resource_shape: str) -> None:
         known_groups = ", ".join(sorted(resource_shapes_by_node_group.keys()))
         raise SystemExit(f"Unknown node group '{node_group}'.\nKnown node groups: {known_groups}")
 
-    # Extract GPU type from resource shape (e.g., "gpu.2xh100-sxm" -> "h100-sxm")
+    # Extract GPU type from resource shape
+    # Formats: "gpu.h100-sxm" (single), "gpu.2xh100-sxm" (multi)
     try:
-        # Handle format like "gpu.2xh100-sxm" or "gpu.8xh200"
-        gpu_part = resource_shape.split(".", 1)[1]  # Get "2xh100-sxm"
-        gpu_type = gpu_part.split("x", 1)[1]  # Get "h100-sxm"
+        gpu_part = resource_shape.split(".", 1)[1]  # Get "h100-sxm" or "2xh100-sxm"
+        # Check if format is "NxGPU_TYPE" (multi-GPU) or just "GPU_TYPE" (single GPU)
+        if gpu_part[0].isdigit() and "x" in gpu_part:
+            gpu_type = gpu_part.split("x", 1)[1]  # Get "h100-sxm" from "2xh100-sxm"
+        else:
+            gpu_type = gpu_part  # Single GPU: "h100-sxm"
     except (IndexError, ValueError):
         raise SystemExit(
-            f"Invalid resource shape format: {resource_shape}. Expected format: gpu.NxGPU_TYPE or cpu.SIZE"
+            f"Invalid resource shape format: {resource_shape}. Expected format: gpu.GPU_TYPE or gpu.NxGPU_TYPE"
         )
 
     available_gpu_types = resource_shapes_by_node_group[node_group]
diff --git a/ci/lepton/model_convergence/configs/base.yaml b/ci/lepton/model_convergence/configs/base.yaml
@@ -19,22 +19,38 @@ container:
 # These keys must be present for the job to authenticate with
 # external services (W&B, Kratos, Lepton) and control runtime caching.
 # HF_HOME is optional but recommended to speed up Hugging Face model loading.
+#
+# For local development, you can disable telemetry (secret names can be overridden the same way, e.g. hf_token_secret=MY_SECRET):
+#   python ci/lepton/core/launch_job.py \
+#     --config-path="../model_convergence/configs" \
+#     --config-name="recipes/esm2_native_te" \
+#     log_to_wandb=false \
+#     log_to_kratos=false
 ############################################################
+
+# Telemetry flags (set to false to skip W&B/Kratos and avoid needing their secrets)
+log_to_wandb: true
+log_to_kratos: true
+
+# Secret name overrides (users can point to their own Lepton secrets)
+wandb_secret: SHARED_WANDB_API_KEY  # pragma: allowlist secret
+hf_token_secret: SHARED_HF_TOKEN  # pragma: allowlist secret
+
 environment_variables:
   - name: WANDB_API_KEY
-    value_from: JWILBER_WANDB_API_KEY
+    value_from: ${wandb_secret}
   - name: KRATOS_SSA_URL
-    value_from: KRATOS_SSA_URL
+    value_from: SHARED_KRATOS_SSA_URL
   - name: KRATOS_SSA_CLIENT_ID
-    value_from: KRATOS_SSA_CLIENT_ID
+    value_from: SHARED_KRATOS_SSA_CLIENT_ID
   - name: KRATOS_SSA_SECRET
-    value_from: KRATOS_SSA_SECRET.jwilber
+    value_from: SHARED_KRATOS_SSA_SECRET  # pragma: allowlist secret
   - name: LEP_LOGIN_CREDENTIALS
-    value_from: LEP_LOGIN_CREDENTIALS
+    value_from: SHARED_LEP_LOGIN_CREDENTIALS
   - name: HF_HOME
     value: /data/esm2/cache
   - name: HF_TOKEN
-    value_from: HUGGING_FACE_HUB_TOKEN.jwilber
+    value_from: ${hf_token_secret}
 
 ############################################################
 # Lepton Cluster Selection & Node Group
diff --git a/ci/lepton/model_convergence/configs/recipes/debug.yaml b/ci/lepton/model_convergence/configs/recipes/debug.yaml
@@ -0,0 +1,110 @@
+# @package _global_
+# Debug config for testing local development setup.
+# Usage:
+#   python ci/lepton/core/launch_job.py \
+#     --config-path="../model_convergence/configs" \
+#     --config-name="recipes/debug" \
+#     hf_token_secret=YOUR_HF_TOKEN_SECRET_NAME
+defaults:
+  - /base
+  - _self_
+
+############################################################
+# Disable telemetry by default for local testing
+############################################################
+log_to_wandb: false
+log_to_kratos: false
+
+############################################################
+# lepton job info - minimal single GPU setup
+############################################################
+node_group: yo-bom-lepton-001
+mount_from: node-nfs:fs1
+num_nodes: 1
+device_type: gpu
+num_devices: 1
+gpu_type: h100-sxm
+resource_shape: gpu.h100-sxm  # Single GPU: gpu.h100-sxm, Multi: gpu.{2,4,8}xh100-sxm
+
+############################################################
+# recipe identifiers
+############################################################
+recipe_subdir: esm2_native_te
+model_type: esm2
+variant: train
+
+framework: native
+precision: bf16
+te_enabled: true
+fp8_enabled: false
+extras: []
+
+############################################################
+# wandb info (disabled by default, but configured if enabled)
+############################################################
+total_gpus: ${multiply:${num_devices},${num_nodes}}
+
+wandb_init_args:
+  project: "debug__${sanitize:${branch}}"
+  group: "debug"
+  job_type: "${recipe_subdir}"
+  name: null
+  mode: "offline"
+
+############################################################
+# task commands - minimal training run
+############################################################
+model_tag: nvidia/esm2_t33_650M_UR50D
+task_cmd: train_fsdp2
+config: L1_650M
+num_train_steps: 10
+micro_batch_size: 4
+load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data
+load_dataset_kwargs_streaming: true
+load_dataset_kwargs_revision: 4ac1d2973567e46b8ca95901f4b4793a21305995  # pragma: allowlist secret
+num_workers: 1
+
+num_warmup_steps: 2
+ckpt_dir: ""
+save_checkpoints: false
+save_final_model: false
+resume_from_checkpoint: false
+use_distributed_checkpoint_fsdp2: false
+
+parallelism_strategy: fsdp2
+thd_enabled: false
+
+############################################################
+# job name
+############################################################
+job_name: "debug-esm2-650m"
+wandb_name: "debug__${now:%Y%m%d-%H%M%S}"
+
+############################################################
+# run script
+############################################################
+run_script: |
+  HYDRA_FULL_ERROR=1 torchrun \
+    --standalone \
+    --nproc_per_node=1 \
+    ${task_cmd}.py \
+    --config-name ${config}.yaml \
+    wandb_init_args.mode=${wandb_init_args.mode} \
+    wandb_init_args.project=${wandb_init_args.project} \
+    +wandb_init_args.group=${wandb_init_args.group} \
+    +wandb_init_args.job_type=${wandb_init_args.job_type} \
+    wandb_init_args.name=${wandb_name} \
+    num_train_steps=${num_train_steps} \
+    dataset.micro_batch_size=${micro_batch_size} \
+    use_sequence_packing=${thd_enabled} \
+    dataset.load_dataset_kwargs.path=${load_dataset_kwargs_path} \
+    dataset.load_dataset_kwargs.streaming=${load_dataset_kwargs_streaming} \
+    +dataset.load_dataset_kwargs.revision=${load_dataset_kwargs_revision} \
+    dataset.num_workers=${num_workers} \
+    lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \
+    checkpoint.ckpt_dir=${ckpt_dir} \
+    checkpoint.save_final_model=${save_final_model} \
+    checkpoint.resume_from_checkpoint=${resume_from_checkpoint} \
+    +checkpoint.save_checkpoints=${save_checkpoints} \
+    +checkpoint.use_distributed_checkpoint_fsdp2=${use_distributed_checkpoint_fsdp2} \
+    fp8_config.enabled=${fp8_enabled}