Emerge-Lab · eugenevinitsky · Jun 27, 2026 · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026
diff --git a/scripts/cluster_configs/nightly_best.yaml b/scripts/cluster_configs/nightly_best.yaml
@@ -0,0 +1,135 @@
+# Multi-agent "best launch" nightly training program config.
+# Derived from the oignons2 (emerge/temp_training) configuration at:
+#   weights/oignons2/config.yaml
+# Adapted to NYU Greene cluster paths and resource shape. Multi-agent gigaflow
+# training over the 8 local CARLA maps with the oignons2 policy architecture,
+# reward shaping (conditioning + randomization on), and partner-blindness /
+# phantom-braking perturbations enabled. Keys here override
+# pufferlib/config/ocean/drive.ini.
+#
+# Launch via scripts/launch_nightly_best.sh (3 seeds, date-stamped).
+
+# Environment — multi-agent gigaflow over all 8 local CARLA towns
+env.simulation_mode: gigaflow
+env.map_dir: pufferlib/resources/drive/binaries/carla
+env.num_maps: 8
+env.num_agents: 720000
+env.min_agents_per_env: 1
+env.max_agents_per_env: 150
+env.use_map_cache: 1
+env.scenario_length: 1200
+# 0 disables periodic scenario resampling — every sub-env keeps the same map
+# for the full run instead of swapping every 38400 steps.
+env.resample_frequency: 0
+env.termination_mode: 1
+env.inactive_agent_threshold: 0.4
+env.dynamics_model: jerk
+env.target_type: static
+env.spawn_initial_speed: 0.0
+env.dt: 0.3
+env.traffic_light_behavior: 1
+env.collision_behavior: 1
+env.offroad_behavior: 1
+
+# Goal setup — three sequential waypoints, route-based placement spaced 20-60 m apart
+env.num_target_waypoints: 3
+env.min_waypoint_spacing: 20.0
+env.max_waypoint_spacing: 60.0
+env.goal_radius: 2.0
+env.goal_speed: 3.0
+
+# Observation shaping (matches oignons2)
+env.obs_slots_lane_n: 80
+env.obs_slots_boundary_n: 80
+env.obs_slots_partners_n: 16
+env.obs_slots_traffic_controls_n: 4
+env.obs_range_partner_m: 200.0
+env.obs_range_road_front_m: 200.0
+env.obs_range_road_behind_m: 40.0
+env.obs_range_road_side_m: 50.0
+env.obs_range_traffic_control_m: 100.0
+env.obs_norm_xy_offset_m: 200.0
+env.obs_norm_goal_offset_m: 200.0
+env.obs_norm_road_seg_length_m: 10.0
+env.obs_norm_road_seg_width_m: 5.0
+env.obs_norm_veh_length_m: 15.0
+env.obs_norm_veh_width_m: 10.0
+env.obs_dropout_lane: 0.5
+env.obs_dropout_boundary: 0.4
+
+# Perturbations (on during training; eval's clean macro zeros these)
+env.partner_blindness_prob: 0.03
+env.partner_blindness_trigger_prob: 0.05
+env.phantom_braking_prob: 0.02
+env.phantom_braking_trigger_prob: 0.02
+env.phantom_braking_duration: 10
+
+# Reward shaping (oignons2 weights + conditioning/randomization on)
+env.reward_conditioning: true
+env.reward_randomization: true
+env.reward_goal: 1.0
+env.reward_collision: 1.5
+env.reward_offroad: 1.5
+env.reward_stop_line: 1.0
+env.reward_comfort: 0.05
+env.reward_lane_align: 0.025
+env.reward_vel_align: 1.0
+env.reward_lane_center: 0.005
+env.reward_velocity: 0.0025
+env.reward_reverse: 0.005
+env.reward_timestep: 2.5e-05
+env.reward_overspeed: 0.05
+
+# Policy — 3x1024 backbone, split actor/critic, gigaflow encoder
+policy.input_size: 256
+policy.backbone_hidden_size: 1024
+policy.backbone_num_layers: 3
+policy.actor_hidden_size: 1024
+policy.actor_num_layers: 0
+policy.critic_hidden_size: 1024
+policy.critic_num_layers: 0
+policy.split_network: true
+policy.encoder_gigaflow: true
+policy.dropout: 0.0
+
+# Training — 10B steps, large minibatch, compiled bfloat16
+train.total_timesteps: 10_000_000_000
+train.learning_rate: 0.0005
+train.minibatch_size: 153600
+train.max_minibatch_size: 153600
+train.update_epochs: 3
+train.bptt_horizon: 128
+train.compile: true
+train.precision: bfloat16
+train.normalize_rewards: false
+train.checkpoint_interval: 500
+train.optimizer: adamw
+
+# Eval — keep validation_gigaflow (CARLA sweep) inline, disable everything else
+# (validation_replay needs nuPlan bins; behaviors_* need labelled scene
+# categories not used in this nightly). Interval 250 keeps eval cost ~5% of
+# wall-clock instead of ~85%.
+eval.validation_defaults.interval: 250
+eval.validation_replay.enabled: 0
+eval.validation_gigaflow.render_backend: egl
+eval.behaviors_full_dir.enabled: 0
+eval.behaviors_hard_stop.enabled: 0
+eval.behaviors_highway_straight.enabled: 0
+eval.behaviors_lane_change.enabled: 0
+eval.behaviors_merge.enabled: 0
+eval.behaviors_parked_cars.enabled: 0
+eval.behaviors_roundabout.enabled: 0
+eval.behaviors_stopped_traffic.enabled: 0
+eval.behaviors_traffic_light_green.enabled: 0
+eval.behaviors_traffic_light_stop.enabled: 0
+eval.behaviors_unprotected_left.enabled: 0
+eval.behaviors_unprotected_right.enabled: 0
+
+# W&B — group has no space (submit_cluster.py joins the inner command
+# without quoting arg values). Launchers (launch_nightly_best.sh and
+# Modal's nightly()) override wandb_group to today's date at launch so
+# runs cluster by night in the UI; the static value here is just the
+# fallback for ad-hoc invocations.
+wandb: True
+wandb_project: nightly-multi
+wandb_group: nightly-multi
diff --git a/scripts/cluster_configs/single_agent_speed_run.yaml b/scripts/cluster_configs/single_agent_speed_run.yaml
@@ -67,6 +67,9 @@ eval.behaviors_unprotected_right.enabled: 0
 
 # W&B. Group has no space: submit_cluster.py joins the inner command into a
 # bash -c string without quoting arg values, so a space would split the arg.
+# Launchers (launch_single_agent.sh and Modal's nightly()) override
+# wandb_group to today's date at launch so runs cluster by night in the UI;
+# the static value here is just the fallback for ad-hoc invocations.
 wandb: True
-wandb_project: single_agent_nightly_test
-wandb_group: Nightly_Test
+wandb_project: nightly-single
+wandb_group: nightly-single
diff --git a/scripts/launch_nightly_best.sh b/scripts/launch_nightly_best.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Launch multi-agent "nightly best" training on the cluster via submit_cluster.py.
+# Mirrors launch_single_agent.sh but uses the oignons2-derived nightly_best.yaml
+# (multi-agent gigaflow over 8 CARLA towns, 10B total steps). Code-isolated per
+# run, container-wrapped, gpu-heartbeated, date-stamped wandb run names.
+#
+# Run on the login node (it sources the venv and submits from there):
+#   ./scripts/launch_nightly_best.sh
+#
+# Overridable via the environment:
+#   PROGRAM_CONFIG/COMPUTE_CONFIG  YAMLs (defaults: scripts/cluster_configs/{nightly_best,nyu_greene}.yaml)
+#   SEEDS           colon sweep passed to --args train.seed (default 0:1:2 -> 3 jobs)
+#   ACCOUNT/PARTITION/TIME   SLURM overrides
+#   MEM             SLURM --mem (default 192gb; the multi-agent config plus
+#                   inline validation_gigaflow eval can spike past 128gb at
+#                   epoch 250)
+#   PREFIX          run-name prefix (default <date>_multi_agent)
+#
+# Examples:
+#   SEEDS=0 ./scripts/launch_nightly_best.sh                  # one-seed smoke run (still submits a real job)
+#   PARTITION=h100_tandon ./scripts/launch_nightly_best.sh    # if h200 QOS is full
+set -euo pipefail
+
+PROGRAM_CONFIG="${PROGRAM_CONFIG:-scripts/cluster_configs/nightly_best.yaml}"
+COMPUTE_CONFIG="${COMPUTE_CONFIG:-scripts/cluster_configs/nyu_greene.yaml}"
+ACCOUNT="${ACCOUNT:-torch_pr_924_tandon_advanced}"
+PARTITION="${PARTITION:-h200_tandon}"
+TIME="${TIME:-1800}"
+MEM="${MEM:-192gb}"
+SEEDS="${SEEDS:-0:1:2}"
+DATE_STAMP="$(date +%Y-%m-%d)"  # read the clock once so prefix, run_name and wandb_group always agree
+PREFIX="${PREFIX:-${DATE_STAMP}_multi_agent}"
+
+source "/scratch/$USER/venvs/pufferdrive/bin/activate"
+
+# One submission per seed so each job gets its own run_name (wandb display
+# name like 2026-05-31_seed0); all seeds share the date stamp as wandb_group.
+IFS=':' read -ra SEED_LIST <<< "$SEEDS"
+for SEED in "${SEED_LIST[@]}"; do
+    python scripts/submit_cluster.py \
+        --save_dir "/scratch/$USER/runs" \
+        --prefix "$PREFIX" \
+        --compute_config "$COMPUTE_CONFIG" \
+        --program_config "$PROGRAM_CONFIG" \
+        --container --heartbeat \
+        --account "$ACCOUNT" --partition "$PARTITION" --time "$TIME" --mem "$MEM" \
+        --args "train.seed=$SEED" "run_name=${DATE_STAMP}_seed${SEED}" "wandb_group=${DATE_STAMP}"
+done
diff --git a/scripts/launch_single_agent.sh b/scripts/launch_single_agent.sh
@@ -41,5 +41,5 @@ for SEED in "${SEED_LIST[@]}"; do
         --program_config "$PROGRAM_CONFIG" \
         --container --heartbeat \
         --account "$ACCOUNT" --partition "$PARTITION" --time "$TIME" \
-        --args "train.seed=$SEED" "run_name=${DATE_STAMP}_seed${SEED}"
+        --args "train.seed=$SEED" "run_name=${DATE_STAMP}_seed${SEED}" "wandb_group=${DATE_STAMP}"
 done
diff --git a/scripts/modal/Dockerfile b/scripts/modal/Dockerfile
@@ -0,0 +1,90 @@
+# Base image for PufferDrive nightly training on Modal.
+#
+# Matches the NYU Greene Singularity sif as closely as possible:
+#   - CUDA 12.8.1 + cuDNN (devel variant of the NVIDIA image)
+#   - Ubuntu 24.04
+#   - Python 3.12
+#
+# The actual repo (and the built .so files) are baked into the image at deploy
+# time by scripts/modal/modal_app.py via copy_local_dir + run_commands. This
+# Dockerfile only handles the slow, version-stable layer: system libs, Python,
+# torch.
+#
+# Build context is the repo root (not scripts/modal/). Don't reference
+# repo files here — modal_app.py copies them in afterward so changes don't
+# invalidate the slow base.
+
+FROM nvidia/cuda:12.8.1-cudnn-devel-ubuntu24.04
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONNOUSERSITE=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    # Multi-arch so the same image runs on T4 (sm_75), A100 (sm_80), L4/L40S
+    # (sm_89), and H100/H200 (sm_90). Build is ~3-4x slower than single-arch
+    # but lets us swap Modal gpu= strings without rebuilding the image.
+    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.9;9.0"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        clang \
+        ccache \
+        ca-certificates \
+        curl \
+        git \
+        ninja-build \
+        pkg-config \
+        # OpenMP for the drive C extension
+        libomp-dev \
+        # EGL + GL for the headless render path in drive.h (eval.validation_gigaflow)
+        libegl1 \
+        libegl-dev \
+        libgles2-mesa-dev \
+        libgl1-mesa-dev \
+        libglvnd-dev \
+        # Raylib's X11 deps — used by the GLFW fallback when EGL isn't picked
+        libx11-dev \
+        libxcursor-dev \
+        libxinerama-dev \
+        libxi-dev \
+        libxrandr-dev \
+        # Headless display server for the GLFW fallback path
+        xvfb \
+        # ffmpeg for validation_gigaflow video encoding
+        ffmpeg \
+        # Python toolchain
+        python3.12 \
+        python3.12-dev \
+        python3.12-venv \
+        python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install uv — faster, deterministic resolver, replaces pip + venv + the
+# constraints-file dance. Modal disallows the Dockerfile ADD directive, so
+# fetch via curl.
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+ENV PATH="/root/.local/bin:${PATH}"
+
+# Project venv at /opt/venv. It still lives inside the image, but the dedicated
+# path mirrors the cluster's /scratch/$USER/venvs layout.
+RUN uv venv /opt/venv --python 3.12
+ENV PATH="/opt/venv/bin:${PATH}" \
+    VIRTUAL_ENV="/opt/venv"
+
+RUN uv pip install setuptools wheel
+
+# torch from pypi is built against CUDA 13; pull the cu128 wheel instead so it
+# matches the base image's CUDA toolkit. setup.py's CUDAExtension path checks
+# this and refuses to build if torch and host CUDA disagree.
+RUN uv pip install --index-url https://download.pytorch.org/whl/cu128 torch
+
+# Pin numpy<2 and pandas<2.2 here so the later `uv pip install -e .` resolve
+# can't drag numpy 2 in via pandas 3 and break the C extension's numpy 1 ABI
+# (NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION in setup.py). uv's resolver
+# respects these caps when computing the install_requires solution, unlike
+# pip which would gladly upgrade pre-installed pandas during -e .
+RUN uv pip install "numpy<2" "pandas<2.2"
+
+WORKDIR /workspace
diff --git a/scripts/modal/README.md b/scripts/modal/README.md
@@ -0,0 +1,72 @@
+# Modal nightly training
+
+Nightly cron that launches PufferDrive training on Modal — 3 seeds of
+`single_agent_speed_run.yaml` plus 3 seeds of `nightly_best.yaml`, run in
+parallel with one A100-80GB per job. Single-agent runs log to wandb project
+`nightly-single`; multi-agent to `nightly-multi`. In each project the
+`wandb_group` is today's UTC date so a night's 3 seeds cluster together in
+the UI. All under the `emerge_` org.
+
+## Files
+
+| File | Purpose |
+|---|---|
+| `Dockerfile` | CUDA 12.8.1 + cuDNN + Ubuntu 24.04 base, system libs, Python 3.12, torch. Slow layer — rarely rebuilt. |
+| `modal_app.py` | Modal app — bakes the repo + builds C extensions on top of the Dockerfile, defines the per-seed `train` function and the `nightly` cron entrypoint. |
+
+The training yamls themselves live in `scripts/cluster_configs/` and are shared
+with the Greene-side launchers (`launch_single_agent.sh`, `launch_nightly_best.sh`).
+
+## One-time setup
+
+```bash
+# Install + auth Modal CLI (host machine)
+pip install modal
+modal token new
+
+# Create the wandb secret. Paste the API key from https://wandb.ai/authorize.
+modal secret create wandb-emerge WANDB_API_KEY=<key>
+```
+
+## Deploy the nightly cron
+
+```bash
+modal deploy scripts/modal/modal_app.py
+```
+
+Modal hashes the source — re-run after any code change to rebuild the image
+and update the deployed cron. The first deploy builds the Dockerfile (~5 min);
+subsequent deploys only rebuild the `uv pip install -e .` layer when repo files
+change (~1 min).
+
+The cron is `0 4 * * *` (04:00 UTC daily). Adjust the `modal.Cron(...)` arg in
+`modal_app.py` to change the wall-clock time.
+
+## Trigger runs manually
+
+```bash
+# Run the full 6-job fan-out now (without waiting for cron):
+modal run scripts/modal/modal_app.py::nightly
+
+# Run a single seed/config (useful for smoke tests):
+modal run scripts/modal/modal_app.py::train \
+    --yaml-path scripts/cluster_configs/single_agent_speed_run.yaml \
+    --seed 0 --run-name local_smoke --wandb-group smoke
+```
+
+## Inspect / cancel
+
+```bash
+modal app list                                      # show deployed apps
+modal app logs pufferdrive-nightly                  # tail logs (running app)
+modal app stop pufferdrive-nightly                  # remove the cron
+```
+
+Per-container logs (one per training run) appear in the Modal dashboard
+under the `pufferdrive-nightly` app.
+
+## Cost note
+
+A100-80GB on Modal is ~$3.20/h. A 12 h training run × 6 jobs = ~$230/night.
+Bring down by lowering `train.total_timesteps` in the yamls, dropping the
+`--gpu` to `A100` (40GB), or limiting to fewer seeds.