diff --git a/.claude/skills/debug/SKILL.md b/.claude/skills/debug/SKILL.md new file mode 100644 index 0000000000..66709da58f --- /dev/null +++ b/.claude/skills/debug/SKILL.md @@ -0,0 +1,33 @@ +--- +name: debug +description: Run commands inside a remote Docker container via the file-based command relay (tools/debugger). Use when the user says "run in Docker", "run on GPU", "debug remotely", "run test in container", "check nvidia-smi", "run pytest in Docker", or needs to execute any command inside a Docker container that shares the repo filesystem. Requires the user to have started server.sh inside the container first. +--- + +# Remote Docker Debugger + +Execute commands inside a Docker container from the host using the file-based command relay. + +**Read `tools/debugger/CLAUDE.md` for full usage details** — it has the protocol, examples, and troubleshooting. + +## Quick Reference + +```bash +# Check connection +bash tools/debugger/client.sh status + +# Connect to server (user must start server.sh in Docker first) +bash tools/debugger/client.sh handshake + +# Run a command +bash tools/debugger/client.sh run "<command>" + +# Long-running command (default timeout is 600s) +bash tools/debugger/client.sh --timeout 1800 run "<command>" + +# Cancel the currently running command +bash tools/debugger/client.sh cancel + +# Reconnect after server restart +bash tools/debugger/client.sh flush +bash tools/debugger/client.sh handshake +``` diff --git a/tools/debugger/CLAUDE.md b/tools/debugger/CLAUDE.md index ab18e2627c..f9a25534f2 100644 --- a/tools/debugger/CLAUDE.md +++ b/tools/debugger/CLAUDE.md @@ -53,10 +53,20 @@ bash tools/debugger/client.sh run "nvidia-smi" bash tools/debugger/client.sh run "python script.py --model /hf-local/Qwen/Qwen3-8B" ``` +### Cancelling Commands + +```bash +# Cancel the currently running command +bash tools/debugger/client.sh cancel + +# Client-side timeout also auto-cancels the running command +``` + ### Important Notes - The server must be started by the user 
manually inside Docker before the handshake. - Default command timeout is 600 seconds (10 minutes). Use `--timeout` for longer tasks. - Commands execute sequentially — one at a time. +- A running command can be cancelled; cancelled commands exit with code 130. - All commands run with the auto-detected repo root as the working directory. - The `.relay/` directory is ephemeral and git-ignored. diff --git a/tools/debugger/README.md b/tools/debugger/README.md index ca7a90eb05..426693e065 100644 --- a/tools/debugger/README.md +++ b/tools/debugger/README.md @@ -52,6 +52,9 @@ bash tools/debugger/client.sh run "bash llm_ptq/scripts/huggingface_example.sh" # Run with a long timeout (default is 600s) bash tools/debugger/client.sh --timeout 1800 run "python my_long_test.py" +# Cancel a running command +bash tools/debugger/client.sh cancel + # Check status bash tools/debugger/client.sh status ``` @@ -65,6 +68,8 @@ The relay uses a directory at `tools/debugger/.relay/` with this structure: ├── server.ready # Written by server on startup ├── client.ready # Written by client during handshake ├── handshake.done # Written by server to confirm handshake +├── running # Written by server while a command is executing (cmd_id:pid) +├── cancel # Written by client to request cancellation of the running command ├── cmd/ # Client writes command .sh files here │ └── <id>.sh # Command to execute └── result/ # Server writes results here @@ -82,9 +87,16 @@ The relay uses a directory at `tools/debugger/.relay/` with this structure: ### Command Execution 1. Client writes a command to `.relay/cmd/<id>.sh` -2. Server detects the file, runs `bash <file>` in the workdir, captures output +2. Server detects the file, runs `bash <file>` in the workdir in background, writes `.relay/running` 3. Server writes `.relay/result/<id>.log` and `.relay/result/<id>.exit` -4. 
Server removes the `.sh` file and `.relay/running`; client reads results and cleans up + +### Cancellation + +1. Client writes `.relay/cancel` +2. Server detects the cancel signal, kills the running command process tree +3. Server writes exit code 130 and removes `.relay/running` and `.relay/cancel` +4. Client-side timeout also triggers cancellation automatically ## Options @@ -107,3 +119,5 @@ The relay uses a directory at `tools/debugger/.relay/` with this structure: - The `.relay/` directory is in `.gitignore` — it is not checked in. - Only one server should run at a time (startup clears the relay directory). - Commands run sequentially in the order the server discovers them. +- A running command can be cancelled via `client.sh cancel`. Cancelled commands exit with code 130. +- Client-side timeouts automatically cancel the running command on the server. diff --git a/tools/debugger/client.sh b/tools/debugger/client.sh index 299bd52e82..d0e642e5b7 100755 --- a/tools/debugger/client.sh +++ b/tools/debugger/client.sh @@ -21,6 +21,7 @@ # Usage: # bash client.sh handshake - Connect to server # bash client.sh run - Run a command and print output +# bash client.sh cancel - Cancel the running command # bash client.sh status - Check server status # # Options: @@ -91,6 +92,8 @@ case "$SUBCOMMAND" in # Generate a unique command ID (timestamp + PID to avoid collisions) cmd_id="$(date +%s%N)_$$" + echo "[client] Running: $*" + # Write the command file atomically (tmp + mv) echo "$*" > "$CMD_DIR/$cmd_id.sh.tmp" mv "$CMD_DIR/$cmd_id.sh.tmp" "$CMD_DIR/$cmd_id.sh" @@ -108,14 +111,32 @@ case "$SUBCOMMAND" in elapsed=$((elapsed + POLL_INTERVAL)) if [[ $elapsed -ge $TIMEOUT ]]; then echo "ERROR: Command timed out after ${TIMEOUT}s." 
- # Clean up the pending command + # Cancel the running command only if it is OUR command + if [[ -f "$RELAY_DIR/running" ]]; then + running_info=$(cat "$RELAY_DIR/running" 2>/dev/null) || true + running_id="${running_info%%:*}" + if [[ "$running_id" == "$cmd_id" ]]; then + echo "Sending cancel signal..." + echo "$cmd_id" > "$RELAY_DIR/cancel" + for _ in $(seq 1 10); do + [[ -f "$RELAY_DIR/running" ]] || break + sleep 1 + done + fi + fi + # Clean up command and any orphaned result files rm -f "$CMD_DIR/$cmd_id.sh" + rm -f "$RESULT_DIR/$cmd_id.exit" "$RESULT_DIR/$cmd_id.log" exit 1 fi done # Read and display results exit_code=$(cat "$RESULT_DIR/$cmd_id.exit") + if ! [[ "$exit_code" =~ ^[0-9]+$ ]]; then + echo "WARNING: Invalid exit code '$exit_code', defaulting to 1." + exit_code=1 + fi if [[ -f "$RESULT_DIR/$cmd_id.log" ]]; then cat "$RESULT_DIR/$cmd_id.log" fi @@ -139,6 +160,11 @@ case "$SUBCOMMAND" in else echo "Handshake: not started" fi + if [[ -f "$RELAY_DIR/running" ]]; then + echo "Running: $(cat "$RELAY_DIR/running")" + else + echo "Running: (idle)" + fi if [[ -d "$CMD_DIR" ]]; then pending=$(find "$CMD_DIR" -maxdepth 1 -type f -name '*.sh' 2>/dev/null | wc -l) else @@ -148,6 +174,10 @@ case "$SUBCOMMAND" in ;; flush) + if [[ -f "$RELAY_DIR/running" ]]; then + echo "ERROR: A command is currently running. Cancel it first or wait for it to finish." 
#######################################
# Print the PID of every live descendant of a process (children,
# grandchildren, ...), one per line, each parent before its own children.
# Arguments: $1 - PID of the root process (the root itself is not printed)
# Outputs:   descendant PIDs on stdout; nothing when pgrep reports none
#######################################
_relay_descendants() {
  local parent=$1 child
  while IFS= read -r child; do
    [[ -n "$child" ]] || continue
    printf '%s\n' "$child"
    _relay_descendants "$child"
  done < <(pgrep -P "$parent" 2>/dev/null || true)
}

#######################################
# Signal handler: stop whatever command is running, remove the relay marker
# files this server owns, and exit 0.
# Globals:   RELAY_DIR (read)
# Outputs:   one status line on stdout
#######################################
cleanup() {
  echo "[server] Shutting down..."
  local running_pid="" pid
  local -a victims=()
  # Every read is guarded with || true so that set -e cannot abort the trap
  # half-way and leave stale marker files behind.
  running_pid=$(cut -d: -f2 "$RELAY_DIR/running" 2>/dev/null) || true
  # Only act on a strictly positive integer: an empty, zero or negative value
  # from a damaged marker must never reach kill (0 / negatives address whole
  # process groups).
  if [[ "$running_pid" =~ ^[1-9][0-9]*$ ]]; then
    # Snapshot the whole tree BEFORE signalling anything: once a parent dies
    # its children are re-parented and can no longer be found through it.
    # `pkill -P` alone would only reach direct children.
    victims=("$running_pid")
    while IFS= read -r pid; do
      victims+=("$pid")
    done < <(_relay_descendants "$running_pid")
    kill "${victims[@]}" 2>/dev/null || true
  fi
  # Kill any remaining direct children of the server itself (e.g. tail -f)
  pkill -P $$ 2>/dev/null || true
  rm -f "$RELAY_DIR/server.ready"
  rm -f "$RELAY_DIR/handshake.done"
  rm -f "$RELAY_DIR/running"
  rm -f "$RELAY_DIR/cancel"
  exit 0
}
trap cleanup SIGINT SIGTERM

#######################################
# Check that the importable `modelopt` package lives under WORKDIR.
# Globals:   WORKDIR (read)
# Outputs:   on mismatch (or import failure) the diagnostic, merged into stdout
# Returns:   0 when modelopt resolves to a path inside WORKDIR, non-zero otherwise
#######################################
check_modelopt_local() {
  # WORKDIR is handed over as argv instead of being spliced into the Python
  # source, so quotes or backslashes in the path cannot break the snippet.
  # commonpath compares whole path components; a plain startswith() would
  # wrongly accept a sibling such as /repo2 when WORKDIR is /repo.
  python -c '
import modelopt, os, sys
actual = os.path.realpath(modelopt.__path__[0])
expected = os.path.realpath(sys.argv[1])
if os.path.commonpath([actual, expected]) != expected:
    print(f"modelopt loaded from {actual}, expected under {expected}", file=sys.stderr)
    sys.exit(1)
' "$WORKDIR" 2>&1
}
#######################################
# Print the PID of every live descendant of a process (children,
# grandchildren, ...), one per line, each parent before its own children.
# Arguments: $1 - PID of the root process (the root itself is not printed)
# Outputs:   descendant PIDs on stdout; nothing when pgrep reports none
#######################################
_relay_descendants() {
  local parent=$1 child
  while IFS= read -r child; do
    [[ -n "$child" ]] || continue
    printf '%s\n' "$child"
    _relay_descendants "$child"
  done < <(pgrep -P "$parent" 2>/dev/null || true)
}

#######################################
# Tell whether at least one of the given PIDs can still be signalled.
# Arguments: PIDs to probe
# Returns:   0 if any is alive, 1 if none is
#######################################
_relay_any_alive() {
  local pid
  for pid in "$@"; do
    kill -0 "$pid" 2>/dev/null && return 0
  done
  return 1
}

#######################################
# Execute one queued command file and publish its result.
# Globals:   RELAY_DIR, RESULT_DIR, WORKDIR, POLL_INTERVAL (all read)
# Arguments: $1 - path of the command file (<cmd_id>.sh)
# Outputs:   progress lines and the command's streamed output on stdout;
#            writes <cmd_id>.log / <cmd_id>.exit under RESULT_DIR and
#            maintains the running / cancel markers under RELAY_DIR
# Returns:   0 (also when the file vanished before it could be read)
#######################################
_relay_run_command() {
  local cmd_file=$1
  local cmd_id cmd_content cmd_pid tail_pid cancel_target exit_code pid
  local cancelled=""
  local -a tree=()

  # Guard against command files deleted by the client between glob expansion
  # and processing (e.g., client timeout on a queued command)
  [[ -f "$cmd_file" ]] || return 0

  cmd_id="$(basename "$cmd_file" .sh)"
  # Tolerate the file disappearing between the guard and the read
  cmd_content=$(cat "$cmd_file" 2>/dev/null) || return 0
  # Remove the command file right after reading: prevents re-execution and
  # avoids racing with a client timeout deleting it mid-run
  rm -f "$cmd_file"
  echo "[server] Executing command $cmd_id: $cmd_content"

  # Clear any stale cancel file from a previous timed-out client
  rm -f "$RELAY_DIR/cancel"

  # Create the log file and stream it to the server console via tail
  : > "$RESULT_DIR/$cmd_id.log"
  tail -f "$RESULT_DIR/$cmd_id.log" &
  tail_pid=$!

  # Run from cmd_content (not the file) since the file is already gone
  (cd "$WORKDIR" && bash -c "$cmd_content") >> "$RESULT_DIR/$cmd_id.log" 2>&1 &
  cmd_pid=$!

  # Track the running command (ID and PID) — atomic write so readers never
  # observe a partial marker
  echo "$cmd_id:$cmd_pid" > "$RELAY_DIR/running.tmp"
  mv "$RELAY_DIR/running.tmp" "$RELAY_DIR/running"

  # Wait for completion or cancellation
  while kill -0 "$cmd_pid" 2>/dev/null; do
    if [[ -f "$RELAY_DIR/cancel" ]]; then
      # Verify cancel targets this command (reject empty or mismatched signals)
      cancel_target=$(cat "$RELAY_DIR/cancel" 2>/dev/null) || true
      if [[ "$cancel_target" != "$cmd_id" ]]; then
        rm -f "$RELAY_DIR/cancel"
        sleep "$POLL_INTERVAL"
        continue
      fi
      echo "[server] Cancelling command $cmd_id (PID $cmd_pid)..."
      # Snapshot the WHOLE process tree before signalling. cmd_pid is only the
      # wrapper subshell; the real workload is usually a grandchild, which
      # `pkill -P` would miss and which becomes unreachable once its parent
      # has died and it has been re-parented.
      tree=("$cmd_pid")
      while IFS= read -r pid; do
        tree+=("$pid")
      done < <(_relay_descendants "$cmd_pid")
      kill "${tree[@]}" 2>/dev/null || true
      # Wait up to 5s for the tree to exit gracefully, then escalate to SIGKILL
      for _ in {1..5}; do
        _relay_any_alive "${tree[@]}" || break
        sleep 1
      done
      if _relay_any_alive "${tree[@]}"; then
        echo "[server] Process $cmd_pid did not exit, sending SIGKILL..."
        kill -9 "${tree[@]}" 2>/dev/null || true
      fi
      wait "$cmd_pid" 2>/dev/null || true
      cancelled="true"
      rm -f "$RELAY_DIR/cancel"
      echo "[cancelled]" >> "$RESULT_DIR/$cmd_id.log"
      echo "[server] Command $cmd_id cancelled."
      break
    fi
    sleep "$POLL_INTERVAL"
  done

  # Determine the exit code (|| exit_code=$? keeps set -e from killing the
  # server when the command exits non-zero)
  if [[ -n "$cancelled" ]]; then
    exit_code=130
  else
    exit_code=0
    wait "$cmd_pid" 2>/dev/null || exit_code=$?
  fi

  # Stop console streaming
  kill "$tail_pid" 2>/dev/null || true
  wait "$tail_pid" 2>/dev/null || true

  # Write the exit code BEFORE removing the running marker, so any observer
  # that sees the marker disappear can immediately find the result
  echo "$exit_code" > "$RESULT_DIR/$cmd_id.exit.tmp"
  mv "$RESULT_DIR/$cmd_id.exit.tmp" "$RESULT_DIR/$cmd_id.exit"

  # Now safe to remove markers
  rm -f "$RELAY_DIR/running"
  rm -f "$RELAY_DIR/cancel"

  echo "[server] Command $cmd_id finished (exit=$exit_code)"
}

# Process every queued command file, one at a time, in glob order
for cmd_file in "$CMD_DIR"/*.sh; do
  _relay_run_command "$cmd_file"
done